from fugashi import GenericTagger, Tagger, build_dictionary import sys import fileinput def main(): """ This is a simple wrapper for fugashi so you can test it from the command line. Like the mecab binary, it treats each line of stdin as one sentence. You can pass tagger arguments here too. """ args = ' '.join(sys.argv[1:]) # This should work if you specify a different dictionary, # but it should also work with the pip unidic. # Try the GenericTagger and then try the Unidic tagger. try: tagger = GenericTagger(args, quiet=True) except RuntimeError: tagger = Tagger(args) for line in fileinput.input([]): print(tagger.parse(line.strip())) def info(): """Print configuration info.""" args = ' '.join(sys.argv[1:]) try: tagger = GenericTagger(args, quiet=True) except RuntimeError: tagger = Tagger(args) #TODO get the fugashi version here too print("Fugashi dictionary info:") print("-----") for di in tagger.dictionary_info: for field in 'version size charset filename'.split(): print( (field + ':').ljust(10), di[field]) print('-----') def build_dict(): """EXPERIMENTAL A wrapper for MeCab's user dictionary building command. This also defaults to utf8. """ # TODO simplify using pip-installed dictionaries as base args = sys.argv[0] + " -f utf8 -t utf8 " + ' '.join(sys.argv[1:]) print(args) build_dictionary(args)