# readability_indices.py
#
# Readability metrics (Flesch Reading Ease, Flesch-Kincaid Grade Level,
# Gunning Fog, SMOG) for Russian ('ru'), English ('en') and Kazakh ('kk')
# texts, plus an HTML highlighter for complex words/sentences.

from nltk.tokenize import sent_tokenize, word_tokenize
import pyphen
import re
from IPython.display import display, HTML

# Kazakh vowel inventory for the simple syllable counter.
# NOTE(review): the original string "аеёиоуыэюяіүұөө" duplicated 'ө' and
# was missing 'ә'; both are corrected here.
_KAZAKH_VOWELS = "аәеёиоөуұүыіэюя"

# One Pyphen dictionary per language, built lazily and reused
# (the original rebuilt a dictionary for every word).
_PYPHEN_CACHE = {}


def _nltk_language(lang):
    """Map our 2-letter code to an NLTK tokenizer language name.

    Kazakh falls back to the English tokenizer — NLTK ships no Kazakh
    sentence model (same fallback as the original code).
    """
    return 'russian' if lang == 'ru' else 'english'


def _tokenize(text, lang):
    """Split *text* into (sentences, alphabetic_words).

    Non-alphabetic tokens (punctuation, numbers) are dropped from the
    word list, mirroring the original per-function filtering.
    """
    nltk_lang = _nltk_language(lang)
    sentences = sent_tokenize(text, language=nltk_lang)
    words = [w for w in word_tokenize(text, language=nltk_lang) if w.isalpha()]
    return sentences, words


def count_syllables(word, lang):
    """Count syllables in *word*; always returns at least 1.

    Kazakh ('kk') uses a simple vowel count; Russian/English use the
    Pyphen hyphenation dictionary (hyphen count + 1).
    """
    if lang == 'kk':
        return max(1, sum(1 for ch in word.lower() if ch in _KAZAKH_VOWELS))
    dic = _PYPHEN_CACHE.get(lang)
    if dic is None:
        dic = _PYPHEN_CACHE[lang] = pyphen.Pyphen(lang=lang)
    return max(1, dic.inserted(word).count('-') + 1)


def is_complex_word(word, lang, syllable_threshold=3):
    """Return True when *word* has at least *syllable_threshold* syllables."""
    return count_syllables(word, lang) >= syllable_threshold


def flesch_reading_ease(text, lang):
    """Flesch Reading Ease score for *text* (higher = easier).

    Uses per-language coefficients; the Kazakh ones are tentative
    (marked as such in the original). Unknown languages return 0.
    """
    sentences, words = _tokenize(text, lang)
    num_sentences = max(1, len(sentences))
    num_words = max(1, len(words))
    syllable_count = sum(count_syllables(w, lang) for w in words)
    asl = num_words / num_sentences      # average sentence length
    asw = syllable_count / num_words     # average syllables per word
    if lang == 'ru':
        return 206.835 - (1.3 * asl) - (60.1 * asw)
    if lang == 'en':
        return 206.835 - (1.015 * asl) - (84.6 * asw)
    if lang == 'kk':
        # Tentative coefficients for Kazakh.
        return 206.835 - (1.2 * asl) - (70 * asw)
    return 0


def flesch_kincaid_grade_level(text, lang):
    """Flesch-Kincaid grade level for *text* (higher = harder).

    Unknown languages return 0.
    """
    sentences, words = _tokenize(text, lang)
    num_sentences = max(1, len(sentences))
    num_words = max(1, len(words))
    syllable_count = sum(count_syllables(w, lang) for w in words)
    asl = num_words / num_sentences
    asw = syllable_count / num_words
    if lang == 'ru':
        return (0.5 * asl) + (8.4 * asw) - 15.59
    if lang == 'en':
        return (0.39 * asl) + (11.8 * asw) - 15.59
    if lang == 'kk':
        return (0.5 * asl) + (9 * asw) - 13
    return 0


def gunning_fog_index(text, lang):
    """Gunning Fog index: 0.4 * (avg sentence length + % complex words)."""
    sentences, words = _tokenize(text, lang)
    num_sentences = max(1, len(sentences))
    num_words = max(1, len(words))
    complex_count = sum(1 for w in words if is_complex_word(w, lang))
    percentage_complex = (complex_count / num_words) * 100
    asl = num_words / num_sentences
    return 0.4 * (asl + percentage_complex)


def smog_index(text, lang):
    """SMOG index; requires at least 3 sentences, otherwise returns 0."""
    sentences, words = _tokenize(text, lang)
    num_sentences = len(sentences)
    num_complex = sum(1 for w in words if is_complex_word(w, lang))
    if num_sentences >= 3:
        return 1.0430 * ((num_complex * (30 / num_sentences)) ** 0.5) + 3.1291
    return 0


def highlight_complex_text(text, lang):
    """Return (html_text, complex_words) for display via IPython HTML.

    Complex words are wrapped in <b>; sentences where more than 30% of
    the words are complex are wrapped in <mark>.

    NOTE(review): in the original the f-strings were no-ops
    (f"{sentence}", f"{word}") — the HTML markup was evidently lost when
    the file was mangled. It is reconstructed here, since the result is
    passed to IPython.display.HTML; confirm the intended tags/styles.
    """
    nltk_lang = _nltk_language(lang)
    sentences = sent_tokenize(text, language=nltk_lang)
    highlighted_sentences = []
    complex_words_list = []
    for sentence in sentences:
        words_filtered = [w for w in word_tokenize(sentence, language=nltk_lang)
                          if w.isalpha()]
        complex_words = [w for w in words_filtered if is_complex_word(w, lang)]
        complex_words_list.extend(complex_words)
        highlighted = sentence
        # Wrap each complex word; \b anchors avoid partial-word matches.
        for word in complex_words:
            highlighted = re.sub(r'\b{}\b'.format(re.escape(word)),
                                 f"<b>{word}</b>", highlighted)
        # Flag whole sentences with a high share of complex words.
        if words_filtered and (len(complex_words) / len(words_filtered)) > 0.3:
            highlighted = f"<mark>{highlighted}</mark>"
        highlighted_sentences.append(highlighted)
    return ' '.join(highlighted_sentences), complex_words_list


def analyze_text(text, lang_code):
    """Compute all readability indices for *text* and print a report.

    *lang_code* must be 'ru', 'en' or 'kk'; anything else prints an
    error message and returns None. Output (in Russian, as in the
    original) goes to stdout, plus an HTML rendering of the highlighted
    text via IPython display.
    """
    if lang_code not in ['ru', 'en', 'kk']:
        print('Unsupported language code. Please use "ru" for Russian, "en" for English, or "kk" for Kazakh.')
        return
    fre = flesch_reading_ease(text, lang_code)
    fkgl = flesch_kincaid_grade_level(text, lang_code)
    fog = gunning_fog_index(text, lang_code)
    smog = smog_index(text, lang_code)
    highlighted_text, complex_words = highlight_complex_text(text, lang_code)
    # Report (user-facing strings kept verbatim from the original).
    print(f"Язык: {'Русский' if lang_code == 'ru' else 'Английский' if lang_code == 'en' else 'Казахский'}")
    print(f"Индекс удобочитаемости Флеша: {fre:.2f}")
    print(f"Индекс Флеша-Кинкейда: {fkgl:.2f}")
    print(f"Индекс тумана Ганнинга: {fog:.2f}")
    print(f"Индекс SMOG: {smog:.2f}")
    print("\nСложные слова:")
    print(', '.join(set(complex_words)))
    print("\nТекст с выделениями:")
    display(HTML(highlighted_text))