else:
vocabulary = None
- bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)
+ # Python 3 str has no .decode(); only decode when argparse handed us bytes (Python 2).
+ bpe = BPE(args.codes, args.merges, args.separator.decode('utf-8') if isinstance(args.separator, bytes) else args.separator, vocabulary, args.glossaries)
for line in args.input:
args.output.write(bpe.process_line(line))
learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)
with codecs.open(args.output.name, encoding='UTF-8') as codes:
- bpe = apply_bpe.BPE(codes, separator=args.separator)
+ # Python 3 str has no .decode(); only decode when argparse handed us bytes (Python 2).
+ bpe = apply_bpe.BPE(codes, separator=args.separator.decode('utf-8') if isinstance(args.separator, bytes) else args.separator)
# apply BPE to each training corpus and get vocabulary
for train_file, vocab_file in zip(args.input, args.vocab):
args.output.write(word[i*args.n:i*args.n+args.n])
i += 1
if i*args.n < len(word):
- args.output.write(args.separator)
+ # Python 3 str has no .decode(); only decode when argparse handed us bytes (Python 2).
+ args.output.write(args.separator.decode('utf-8') if isinstance(args.separator, bytes) else args.separator)
args.output.write(' ')
else:
args.output.write(word + ' ')
if args.output.name != '<stdout>':
args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
- segment_char_ngrams(args)
\ No newline at end of file
+ segment_char_ngrams(args)
else:
vocabulary = None
- bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)
+ # Python 3 str has no .decode(); only decode when argparse handed us bytes (Python 2).
+ bpe = BPE(args.codes, args.merges, args.separator.decode('utf-8') if isinstance(args.separator, bytes) else args.separator, vocabulary, args.glossaries)
for line in args.input:
args.output.write(bpe.process_line(line))