File size: 3,307 Bytes
6aaeb71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from typing import List
from enum import Enum
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer


# Exact (already cleaned) phrases for which `to_category` reports no category
# at all, even though they contain category keywords.
PHRASE_NO_PROBLEMS = [
    'got food',
    'got food and clothes',
    'got food and covers',
]

# Keyword lists, one per help category. Each entry is compared against a single
# whitespace-separated word of a cleaned help phrase.
# NOTE(review): several entries look like French or transliterated spellings
# (e.g. 'tentes', 'couvertures', 'nouriture') - confirm before "correcting" them.

# Shelter and bedding. 'housing_shelter' is the token `clean` substitutes for
# the literal 'Housing/Shelter'.
KEYS_HOUSE = [
    'shelters', 'mattresses', 'pillows', 'blankets',
    'shelter', 'tentes', 'housing', 'couvertures',
    'tents', 'covers', 'sdader', 'housing_shelter',
]

# Food, drink and cooking supplies.
KEYS_FOOD = [
    'groceries', 'nouriture', 'food', 'water', 'gaz',
    'dishes', 'oil', 'sugar', 'tea', 'hungry',
]

# Clothing and hygiene.
KEYS_CLOTHES = ['clothes', 'clothing', 'hygiene']

# Medicines and medical conditions.
KEYS_MEDICAL = [
    'betadine', 'medical', 'diabetics', 'medicaments',
    'diabetes', 'doliprane', 'vitamines', 'drugs',
]

class HelpCategory(Enum):
    """Categories a help request can be classified into.

    Each member's value is the lowercase label written out when categories
    are serialised to text.
    """

    HOUSE = "house"
    FOOD = "food"
    CLOTHES = "clothes"
    MEDICAL = "medical"
    # Fallback for phrases in which no keyword matched. The spelling is part
    # of the public name and value, so it is kept as-is.
    UNKNOW = "unknow"


# Import-time setup: request the WordNet corpora, then build the shared
# lemmatizer used by the rest of this module.
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

# Lemmatized counterpart of each keyword list (same order, one entry per
# keyword), so that inflected words can be matched by their lemma.
lemmatize_house = list(map(lemmatizer.lemmatize, KEYS_HOUSE))
lemmatize_food = list(map(lemmatizer.lemmatize, KEYS_FOOD))
lemmatize_clothes = list(map(lemmatizer.lemmatize, KEYS_CLOTHES))
lemmatize_medical = list(map(lemmatizer.lemmatize, KEYS_MEDICAL))

def to_category(text: str) -> List[HelpCategory]:
    """Classify one cleaned help phrase into help categories.

    The phrase is split on whitespace; each word is lemmatized once and its
    lemma is looked up in the lemmatized keyword lists.

    Args:
        text: A single help phrase, already lower-cased and stripped.

    Returns:
        The categories mentioned in the phrase, each listed once, in order of
        first appearance. An empty list when ``text`` is one of
        ``PHRASE_NO_PROBLEMS``; ``[HelpCategory.UNKNOW]`` when no keyword
        matches.
    """
    if text in PHRASE_NO_PROBLEMS:
        return []

    # Checked in this order for every word, which fixes the result order when
    # one word belongs to several categories.
    keyword_table = (
        (HelpCategory.HOUSE, lemmatize_house),
        (HelpCategory.FOOD, lemmatize_food),
        (HelpCategory.CLOTHES, lemmatize_clothes),
        (HelpCategory.MEDICAL, lemmatize_medical),
    )

    categories = []
    for word in text.split():
        # A raw keyword always lemmatizes to an entry of the matching
        # lemmatized list, so comparing lemmas also covers exact keyword hits
        # and avoids reporting the same category twice for one word.
        lemma = lemmatizer.lemmatize(word)
        for category, lemmas in keyword_table:
            if lemma in lemmas and category not in categories:
                categories.append(category)

    if not categories:
        return [HelpCategory.UNKNOW]
    return categories


def clean(text: str) -> str:
    """Normalise a raw help description before it is split into phrases.

    The literal ``Housing/Shelter`` becomes the single token
    ``housing_shelter``, every remaining ``/`` becomes a comma, and the result
    is lower-cased with surrounding whitespace removed.
    """
    # Rewrite the compound label first so that its slash is not turned into a
    # separator by the next replacement.
    protected = text.replace('Housing/Shelter', 'housing_shelter')
    return protected.replace('/', ',').lower().strip()


def to_list(text: str) -> List[str]:
    """Split a cleaned help description into individual phrases.

    The text is cut at every comma; inside each piece, dots are replaced by
    spaces and surrounding whitespace is removed. Empty pieces are kept.
    """
    return [piece.replace('.', ' ').strip() for piece in text.split(',')]


def help_text_to_help_category(helps: List[str]) -> 'List[HelpCategory]':
    """Collect the distinct help categories mentioned across several phrases.

    Args:
        helps: Cleaned help phrases, each classified with ``to_category``.

    Returns:
        Every category found, listed once, in the order it was first
        encountered. Empty when ``helps`` is empty or every phrase yields no
        category.
    """
    # A dict is used as an ordered set: it removes duplicates while keeping
    # first-seen order, so the result does not vary between runs the way
    # iterating a set of enum members can.
    first_seen = dict.fromkeys(
        category
        for help_string in helps
        for category in to_category(help_string)
    )
    return list(first_seen)


def add_category(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``help_category`` column derived from ``Help Details``.

    Each ``Help Details`` value is cleaned, split into phrases and mapped to
    a list of help categories. The frame is modified in place and returned.
    """
    cleaned = df['Help Details'].apply(clean)
    phrases = cleaned.apply(to_list)
    df['help_category'] = phrases.apply(help_text_to_help_category)
    return df

def string_category(df: pd.DataFrame) -> pd.DataFrame:
    """Replace each list in ``help_category`` with a comma-joined string.

    Every element's ``.value`` is taken and the values are joined with ``,``
    in list order. The frame is modified in place and returned.
    """
    def _join_values(categories):
        return ','.join(member.value for member in categories)

    df['help_category'] = df['help_category'].apply(_join_values)
    return df