"""Classify free-text help requests into coarse help categories."""

from enum import Enum
from typing import List

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Whole phrases meaning that nothing is needed; they map to no category at all.
PHRASE_NO_PROBLEMS = ['got food', 'got food and clothes', 'got food and covers']

# Keywords per category, compared against single lower-cased words.
KEYS_HOUSE = [
    "shelters",
    "mattresses",
    "pillows",
    "blankets",
    "shelter",
    "tentes",
    "housing",
    "couvertures",
    "tents",
    "covers",
    "sdader",
    "housing_shelter",
]

KEYS_FOOD = [
    "groceries",
    "nouriture",
    "food",
    "water",
    "gaz",
    "dishes",
    "oil",
    "sugar",
    "tea",
    "hungry",
]

KEYS_CLOTHES = [
    "clothes",
    "clothing",
    "hygiene",
]

KEYS_MEDICAL = [
    "betadine",
    "medical",
    "diabetics",
    "medicaments",
    "diabetes",
    "doliprane",
    "vitamines",
    "drugs",
]


class HelpCategory(Enum):
    """Categories a help request can be assigned to."""

    HOUSE = 'house'
    FOOD = 'food'
    CLOTHES = 'clothes'
    MEDICAL = 'medical'
    # Fallback used by to_category when no keyword matches.
    UNKNOW = 'unknow'


# Fetch the WordNet data used by WordNetLemmatizer (runs at import time).
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

lemmatize_house = [lemmatizer.lemmatize(word) for word in KEYS_HOUSE]
lemmatize_food = [lemmatizer.lemmatize(word) for word in KEYS_FOOD]
lemmatize_clothes = [lemmatizer.lemmatize(word) for word in KEYS_CLOTHES]
lemmatize_medical = [lemmatizer.lemmatize(word) for word in KEYS_MEDICAL]

# One row per category: (category, raw keywords, lemmatized keywords).
# Frozensets give O(1) membership tests; row order fixes the order in which
# categories are reported for a single word.
_CATEGORY_KEYWORDS = (
    (HelpCategory.HOUSE, frozenset(KEYS_HOUSE), frozenset(lemmatize_house)),
    (HelpCategory.FOOD, frozenset(KEYS_FOOD), frozenset(lemmatize_food)),
    (HelpCategory.CLOTHES, frozenset(KEYS_CLOTHES), frozenset(lemmatize_clothes)),
    (HelpCategory.MEDICAL, frozenset(KEYS_MEDICAL), frozenset(lemmatize_medical)),
)


def to_category(text: str) -> List[HelpCategory]:
    """Return the help categories mentioned in a single help phrase.

    Args:
        text: One cleaned help phrase (see ``clean`` and ``to_list``).

    Returns:
        An empty list when ``text`` is one of ``PHRASE_NO_PROBLEMS``.
        Otherwise the distinct categories whose keywords occur in ``text``,
        in order of first appearance, or ``[HelpCategory.UNKNOW]`` when no
        keyword matches.
    """
    if text in PHRASE_NO_PROBLEMS:
        return []

    categories: List[HelpCategory] = []
    for word in text.split():
        # Lemmatize once per word instead of once per category.
        lemma = lemmatizer.lemmatize(word)
        for category, raw_keys, lemma_keys in _CATEGORY_KEYWORDS:
            if category in categories:
                # A raw key usually matches its lemmatized form as well;
                # report every category only once.
                continue
            if word in raw_keys or lemma in lemma_keys:
                categories.append(category)

    return categories or [HelpCategory.UNKNOW]


def clean(text: str) -> str:
    """Normalize a raw help description before it is split into phrases.

    The exact spelling 'Housing/Shelter' is rewritten to the single token
    'housing_shelter' first, so that the following '/' -> ',' replacement
    does not split it. The result is lower-cased and stripped.
    """
    text = text.replace('Housing/Shelter', 'housing_shelter')
    text = text.replace('/', ',')
    return text.lower().strip()


def to_list(text: str) -> List[str]:
    """Split a cleaned description on commas into individual phrases.

    Dots inside each phrase are replaced by spaces and surrounding
    whitespace is removed.
    """
    return [part.replace('.', ' ').strip() for part in text.split(',')]


def help_text_to_help_category(helps: List[str]) -> List[HelpCategory]:
    """Collect the distinct categories found across several help phrases.

    Args:
        helps: Phrases as produced by ``to_list``.

    Returns:
        Each category at most once, in ``HelpCategory`` declaration order.
    """
    found = set()
    for help_string in helps:
        found.update(to_category(help_string))
    # Iterating the enum instead of the set keeps the order stable:
    # set order depends on per-process string hash randomization.
    return [category for category in HelpCategory if category in found]


def add_category(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'help_category' column derived from the 'Help Details' column.

    The frame is modified in place and also returned.
    """
    df['help_category'] = (
        df['Help Details']
        .apply(clean)
        .apply(to_list)
        .apply(help_text_to_help_category)
    )
    return df


def string_category(df: pd.DataFrame) -> pd.DataFrame:
    """Replace each 'help_category' entry by its comma-joined enum values.

    Every entry is iterated and ``.value`` is read from each element, so the
    column is expected to hold iterables of ``HelpCategory`` (as written by
    ``add_category``). The frame is modified in place and also returned.
    """
    df['help_category'] = df['help_category'].apply(
        lambda categories: ','.join(category.value for category in categories)
    )
    return df