CHANGELOG
---------
+v0.3:
+ - library is now installable via pip
+ - fix occasional problems with UTF-8 whitespace and newlines in learn_bpe and apply_bpe.
+ - do not silently convert UTF-8 newline characters into "\n"
+ - do not silently convert UTF-8 whitespace characters into " "
+ - UTF-8 whitespace and newline characters are now considered part of a word, and segmented by BPE
+
v0.2:
- different, more consistent handling of end-of-word token (commit a749a7) (https://github.com/rsennrich/subword-nmt/issues/19)
- allow passing of vocabulary and frequency threshold to apply_bpe.py, preventing the production of OOV (or rare) subword units (commit a00db)
--- /dev/null
+subword_nmt/apply_bpe.py
\ No newline at end of file
--- /dev/null
+subword_nmt/get_vocab.py
\ No newline at end of file
--- /dev/null
+subword_nmt/learn_bpe.py
\ No newline at end of file
--- /dev/null
+subword_nmt/learn_joint_bpe_and_vocab.py
\ No newline at end of file
from __future__ import unicode_literals, division
import sys
+import os
+import inspect
import codecs
import io
import argparse
import re
+import warnings
# hack for python2/3 compatibility
from io import open
if __name__ == '__main__':
+ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+ newdir = os.path.join(currentdir, 'subword_nmt')
+ if os.path.isdir(newdir):
+ warnings.simplefilter('default')
+ warnings.warn(
+ "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
+ DeprecationWarning
+ )
+
# python 2/3 compatibility
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
"""
from __future__ import print_function, unicode_literals, division
+
import sys
import codecs
import io
import argparse
+
from collections import defaultdict
# hack for python2/3 compatibility
from io import open
argparse.open = open
-# python 2/3 compatibility
-if sys.version_info < (3, 0):
- sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
- sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
- sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
-
-
def create_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
if __name__ == '__main__':
+ # python 2/3 compatibility
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ else:
+ sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+ sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+ sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)
+
parser = create_parser()
args = parser.parse_args()
#! /usr/bin/env python
from __future__ import print_function
+
+import os
import sys
+import inspect
+import warnings
+
from collections import Counter
# hack for python2/3 compatibility
if __name__ == "__main__":
+ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+ newdir = os.path.join(currentdir, 'subword_nmt')
+ if os.path.isdir(newdir):
+ warnings.simplefilter('default')
+ warnings.warn(
+ "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
+ DeprecationWarning
+ )
+
# python 2/3 compatibility
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
from __future__ import unicode_literals
+import os
import sys
+import inspect
import codecs
import re
import copy
import argparse
+import warnings
from collections import defaultdict, Counter
# hack for python2/3 compatibility
if __name__ == '__main__':
+ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+ newdir = os.path.join(currentdir, 'subword_nmt')
+ if os.path.isdir(newdir):
+ warnings.simplefilter('default')
+ warnings.warn(
+ "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
+ DeprecationWarning
+ )
+
# python 2/3 compatibility
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
import sys
import os
+import inspect
import codecs
import argparse
import tempfile
+import warnings
from collections import Counter
import learn_bpe
if __name__ == '__main__':
+ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+ newdir = os.path.join(currentdir, 'subword_nmt')
+ if os.path.isdir(newdir):
+ warnings.simplefilter('default')
+ warnings.warn(
+ "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
+ DeprecationWarning
+ )
+
# python 2/3 compatibility
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)