26 lines
743 B
Bash
26 lines
743 B
Bash
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Date: 2023-03-16 21:21:38
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditTime: 2023-03-26 22:58:02
# @FilePath: /Open-Llama/data/download_the_pile.sh
# @Description:
# Download the Pile training shards (00..29) into data/the_pile, then run
# data/preprocess_the_pile.py after creating data/pretrain_data.
#
# All paths are relative, so run this script from the directory that
# contains the data/ folder. The script is safe to re-run: existing
# directories are reused and partially downloaded shards are resumed.
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
###

# Abort on any failed command, unset variable, or failed pipeline stage so
# that preprocessing never starts on an incomplete download.
set -euo pipefail

# Inclusive range of shard indices to fetch.
start=0
end=29

base_url="https://the-eye.eu/public/AI/pile/train"
out_dir="data/the_pile"

# -p: do not fail when the directory is already there (re-run / resume).
mkdir -p "$out_dir"

for (( i = start; i <= end; i++ )); do
  # Shards are named with a zero-padded two-digit index, e.g. 07.jsonl.zst.
  printf -v shard '%02d.jsonl.zst' "$i"
  url="${base_url}/${shard}"

  echo "Downloading file: $url"
  # --fail     : treat HTTP errors as failures instead of saving the error
  #              page as if it were a shard.
  # --location : follow redirects issued by the mirror.
  # -C -       : resume from the size of any existing partial file.
  if ! curl --fail --location -C - "$url" -o "${out_dir}/${shard}"; then
    echo "Failed to download: $url" >&2
    exit 1
  fi
done

echo "All files downloaded successfully."

mkdir -p data/pretrain_data
python3 data/preprocess_the_pile.py