-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocess.sh
More file actions
30 lines (27 loc) · 797 Bytes
/
preprocess.sh
File metadata and controls
30 lines (27 loc) · 797 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
TASK=grammarly
for SPLIT in test train val plain
do
for LANG in source target
do
python -m multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "$TASK/$SPLIT.$LANG" \
--outputs "$TASK/$SPLIT.bpe.$LANG" \
--workers 60 \
--keep-empty;
done
done
fairseq-preprocess \
--source-lang "source" \
--target-lang "target" \
--trainpref "${TASK}/train.bpe" \
--validpref "${TASK}/val.bpe" \
--testpref "${TASK}/plain.bpe" \
--destdir "${TASK}-bin/" \
--workers 60 \
--srcdict dict.txt \
--tgtdict dict.txt;