Spaces:
Running
Running
| # -*- coding: utf-8 -*- | |
| import sys | |
| from os.path import dirname, abspath | |
| sys.path.append(dirname(dirname(abspath(__file__)))) | |
| from nose.tools import raises | |
| from torchmoji.word_generator import WordGenerator | |
# True when the interpreter running the tests is Python 2. Uses the structured
# sys.version_info tuple rather than parsing the free-form sys.version string.
IS_PYTHON2 = sys.version_info[0] == 2
@raises(ValueError)
def test_only_unicode_accepted():
    """Non-Unicode strings raise a ValueError.

    The ``raises`` decorator makes the test pass only when a ValueError
    propagates out of the body. On Python 3 every ``str`` is Unicode, so the
    failing input cannot be constructed; the ValueError is raised by hand
    there so the test still passes.
    """
    if not IS_PYTHON2:
        raise ValueError("You are using python 3 so this test should always pass")

    sentences = [
        u'Hello world',
        u'I am unicode',
        'I am not unicode',  # a byte string on Python 2: the offending input
    ]

    wg = WordGenerator(sentences)
    # Consume the generator completely; the ValueError is expected to surface
    # while the non-Unicode sentence is being processed.
    for _ in wg:
        pass
def test_unicode_sentences_ignored_if_set():
    """A sentence with non-ASCII characters yields no words when Unicode text is disallowed."""
    czech_sentences = [u'Dobrý den, jak se máš?']
    generator = WordGenerator(czech_sentences, allow_unicode_text=False)
    tokens = generator.get_words(czech_sentences[0])
    assert tokens == []
def test_check_ascii():
    """check_ascii accepts an ASCII word and rejects non-ASCII input.

    Only exercised on Python 2; on Python 3 the test body is skipped and the
    function returns without asserting anything.
    """
    if IS_PYTHON2:
        checker = WordGenerator([])
        assert checker.check_ascii('ASCII')
        # Accented letters first, then symbol characters: both must be rejected.
        for non_ascii in ('ščřžýá', '❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢'):
            assert not checker.check_ascii(non_ascii)
def test_convert_unicode_word():
    """With Unicode text allowed, a single accented character is returned unchanged and flagged True."""
    generator = WordGenerator([], allow_unicode_text=True)
    expected = (True, u'\u010d')
    actual = generator.convert_unicode_word(u'č')
    # On failure the assertion message shows the value actually returned.
    assert actual == expected, '{}'.format(actual)
def test_convert_unicode_word_ignores_if_set():
    """With Unicode text disallowed, an accented character converts to (False, '')."""
    generator = WordGenerator([], allow_unicode_text=False)
    expected = (False, '')
    actual = generator.convert_unicode_word(u'č')
    # On failure the assertion message shows the value actually returned.
    assert actual == expected, '{}'.format(actual)
def test_convert_unicode_chars():
    """With Unicode text allowed, a word made of several accented characters is returned intact."""
    generator = WordGenerator([], allow_unicode_text=True)
    # The same nine characters as the input, spelled as escapes.
    expected = (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9')
    actual = generator.convert_unicode_word(u'ěščřžýáíé')
    assert actual == expected, '{}'.format(actual)