# Spaces:
# Runtime error
# Runtime error
from enum import Enum
from typing import List

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
# Whole help strings for which to_category() returns no category at all.
# They are compared against the complete string, not word by word.
PHRASE_NO_PROBLEMS = [
    'got food',
    'got food and clothes',
    'got food and covers',
]

# Keyword tables, one per help category. Words are compared verbatim
# (after lower-casing by clean()), so the spellings below -- including the
# non-English and apparently misspelled ones -- are presumably the forms
# that occur in the data. NOTE(review): confirm before "correcting" any.
KEYS_HOUSE = [
    "shelters",
    "mattresses",
    "pillows",
    "blankets",
    "shelter",
    "tentes",
    "housing",
    "couvertures",
    "tents",
    "covers",
    "sdader",
    # Token that clean() substitutes for the literal 'Housing/Shelter'.
    "housing_shelter",
]

KEYS_FOOD = [
    "groceries",
    "nouriture",
    "food",
    "water",
    "gaz",
    "dishes",
    "oil",
    "sugar",
    "tea",
    "hungry",
]

KEYS_CLOTHES = [
    "clothes",
    "clothing",
    "hygiene",
]

KEYS_MEDICAL = [
    "betadine",
    "medical",
    "diabetics",
    "medicaments",
    "diabetes",
    "doliprane",
    "vitamines",
    "drugs",
]
class HelpCategory(Enum):
    """Categories a help request can be classified into.

    Each member's value is the lowercase label that string_category()
    writes into the 'help_category' column.
    """

    HOUSE = 'house'
    FOOD = 'food'
    CLOTHES = 'clothes'
    MEDICAL = 'medical'
    # Fallback used when no keyword matched. The spelling ("unknow") is kept
    # as is: both the member name and its value are part of the interface.
    UNKNOW = 'unknow'
# Fetch the NLTK resources used by WordNetLemmatizer. This runs at import
# time, so importing the module may touch the network / NLTK data directory.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Shared lemmatizer instance, reused by to_category().
lemmatizer = WordNetLemmatizer()

# Lemmatized copies of the keyword tables, built once at import so that
# inflected forms of a keyword can be matched through their lemma.
lemmatize_house = [lemmatizer.lemmatize(word) for word in KEYS_HOUSE]
lemmatize_food = [lemmatizer.lemmatize(word) for word in KEYS_FOOD]
lemmatize_clothes = [lemmatizer.lemmatize(word) for word in KEYS_CLOTHES]
lemmatize_medical = [lemmatizer.lemmatize(word) for word in KEYS_MEDICAL]
def to_category(text: str) -> List[HelpCategory]:
    """Classify one help string into the categories its words mention.

    Args:
        text: A single, already cleaned help item (lower-case). It is split
            on whitespace and every word is looked up in the keyword tables.

    Returns:
        * ``[]`` when ``text`` is exactly one of ``PHRASE_NO_PROBLEMS``;
        * otherwise the matched categories, each listed once, in order of
          first appearance;
        * ``[HelpCategory.UNKNOW]`` when no word matched any table.
    """
    if text in PHRASE_NO_PROBLEMS:
        return []

    # Only the lemmatized tables are consulted: each one is built by
    # lemmatizing the corresponding KEYS_* list, so any word that appears
    # verbatim in KEYS_* also matches through its lemma.
    lemma_tables = (
        (lemmatize_house, HelpCategory.HOUSE),
        (lemmatize_food, HelpCategory.FOOD),
        (lemmatize_clothes, HelpCategory.CLOTHES),
        (lemmatize_medical, HelpCategory.MEDICAL),
    )

    categories = []
    for word in text.split():
        lemma = lemmatizer.lemmatize(word)  # lemmatize once per word
        for lemma_keys, category in lemma_tables:
            # Skip categories already recorded so the result has no repeats.
            if lemma in lemma_keys and category not in categories:
                categories.append(category)

    if not categories:
        categories = [HelpCategory.UNKNOW]
    return categories
def clean(text: str) -> str:
    """Normalize a raw help description before it is split into items.

    The literal 'Housing/Shelter' is first collapsed into the single token
    'housing_shelter' so that the following slash-to-comma replacement does
    not break it apart. Every remaining '/' becomes ',', and the result is
    lower-cased and stripped of surrounding whitespace.

    Args:
        text: The raw help description. Non-string values are accepted and
            treated as empty text.

    Returns:
        The normalized string, or ``''`` for non-string input.
    """
    # When applied over a DataFrame column, missing cells arrive as
    # non-strings (pandas uses float NaN); treat them as empty text instead
    # of failing on ``.replace``.
    if not isinstance(text, str):
        return ''

    text = text.replace('Housing/Shelter', 'housing_shelter')
    text = text.replace('/', ',')
    return text.lower().strip()
def to_list(text: str) -> List[str]:
    """Split a cleaned help description into its individual help items.

    The text is split on commas; within each item every '.' is replaced by
    a space and surrounding whitespace is removed.

    Args:
        text: Comma-separated help description.

    Returns:
        The list of items. Items that end up empty (e.g. from a trailing or
        doubled comma) are dropped. If every item is empty, the items are
        returned unchanged, so an empty description still yields ``['']``.
    """
    items = [piece.replace('.', ' ').strip() for piece in text.split(',')]
    non_empty = [item for item in items if item]
    # Keep the all-empty result as is so callers still receive one item for
    # an empty description.
    return non_empty if non_empty else items
def help_text_to_help_category(helps: List[str]) -> List[HelpCategory]:
    """Collect the distinct categories mentioned across several help items.

    Args:
        helps: Help items, each of which is classified with to_category().

    Returns:
        Every category returned for any item, without repeats, in order of
        first appearance.
    """
    # A dict is used as an ordered set: keys keep insertion order, so the
    # result is the same on every run (iteration order of a plain set of
    # enum members can differ between interpreter runs).
    seen = {}
    for help_string in helps:
        seen.update(dict.fromkeys(to_category(help_string)))
    return list(seen)
def add_category(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'help_category' column derived from the 'Help Details' column.

    Each 'Help Details' value is passed through clean(), to_list() and
    help_text_to_help_category(), in that order.

    Args:
        df: Frame containing a 'Help Details' column. It is modified in
            place.

    Returns:
        The same ``df`` object, with 'help_category' added or overwritten.
    """
    cleaned = df['Help Details'].apply(clean)
    items = cleaned.apply(to_list)
    df['help_category'] = items.apply(help_text_to_help_category)
    return df
def string_category(df: pd.DataFrame) -> pd.DataFrame:
    """Turn each row's list of categories into a comma-separated string.

    Args:
        df: Frame whose 'help_category' column holds iterables of objects
            exposing a ``value`` attribute. It is modified in place.

    Returns:
        The same ``df`` object; 'help_category' now holds the values joined
        with ',' (an empty iterable becomes ``''``).
    """
    df['help_category'] = df['help_category'].apply(
        lambda categories: ','.join(category.value for category in categories)
    )
    return df