big_stats[item] = freq
-def main(vocab, outfile, num_symbols, min_frequency=2, verbose=False):
+def main(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False):
"""Learn num_symbols BPE operations from vocabulary, and write to outfile.
"""
# version numbering allows backward compatibility
outfile.write('#version: 0.2\n')
- vocab = get_vocabulary(args.input, is_dict = args.dict_input)
+ vocab = get_vocabulary(infile, is_dict)
vocab = dict([(tuple(x[:-1])+(x[-1]+'</w>',) ,y) for (x,y) in vocab.items()])
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
if args.output.name != '<stdout>':
args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
- main(vocab, args.output, args.symbols, args.min_frequency, args.verbose)
+ main(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input)
full_vocab += learn_bpe.get_vocabulary(f)
f.seek(0)
+ vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]
+
# learn BPE on combined vocabulary
with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
- learn_bpe.main(full_vocab, output, args.symbols, args.min_frequency, args.verbose)
+ learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)
with codecs.open(args.output.name, encoding='UTF-8') as codes:
bpe = apply_bpe.BPE(codes, args.separator, None)