Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	uploaded all the files
Browse files- app.py +41 -0
- apps/demo.py +113 -0
- apps/eda.py +5 -0
- apps/home.py +12 -0
- apps/models.py +6 -0
- data/labels.txt +22 -0
- data/sample.txt +1 -0
- models/distilbert/config.json +69 -0
- models/distilbert/special_tokens_map.json +7 -0
- models/distilbert/tf_model.h5 +3 -0
- models/distilbert/tf_model.preproc +0 -0
- models/distilbert/tokenizer.json +0 -0
- models/distilbert/tokenizer_config.json +14 -0
- models/distilbert/vocab.txt +0 -0
- requirements.txt +16 -0
- utils.py +180 -0
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,41 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
import streamlit as st

st.set_page_config(layout="wide")
from streamlit_option_menu import option_menu
from apps import home, eda, models, demo


# Track across reruns whether the demo form has received valid input.
if "valid_inputs_received" not in st.session_state:
    st.session_state["valid_inputs_received"] = False
# image = Image.open('data/logo.png')
# image=image.resize((100,100))
header = st.container()

# Registry of sub-apps: each entry maps a sidebar title and icon to the
# function that renders that page.
apps = [
    {"func": home.app, "title": "Home", "icon": "house"},
    {"func": eda.app, "title": "EDA", "icon": "bar-chart"},
    # {"func": models.app, "title": "Models", "icon": "cpu"},
    {"func": demo.app, "title": "Demo", "icon": "cloud-upload"},
]

titles = [app["title"] for app in apps]
icons = [app["icon"] for app in apps]


with st.sidebar:
    # logo = st.image(image)
    selected = option_menu(
        "Main Menu",
        options=titles,
        icons=icons,
        menu_icon="cast",
    )

# Dispatch to the page the user picked in the sidebar.
for app in apps:
    if app["title"] == selected:
        app["func"]()
        break
    	
        apps/demo.py
    ADDED
    
    | @@ -0,0 +1,113 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
import streamlit as st
from streamlit_echarts import st_echarts
import pandas as pd
from annotated_text import annotated_text, annotation
from utils import load_skill_extractor, create_ann_list
from utils import get_skill, clean_text, default_text, predict_cat, load_model


# Load the (cached) ktrain predictor once at import time.
model = load_model()
# skill_extractor = load_skill_extractor()

def app():
    """Render the demo page: a job-description form plus predicted
    category and probability-gauge widgets."""
    st.title("Demo")
    with st.form(key="text_val"):
        input_text = st.text_area('Enter the text here', value=default_text(), height=200)
        submit_button = st.form_submit_button(label="Submit")

    cls_text = clean_text(input_text)
    # st.write(cls_text)

    col1, col2 = st.columns(2)
    # Single-entry gauge dataset; `value` is filled in after prediction.
    gaugeData = [
        {
            "value": 0,
            "name": 'Probability',  # fixed typo: was 'Probabiltiy'
            "detail": {
                "valueAnimation": True,
                "offsetCenter": ['0%', '0%']
            }
        }]
    # ECharts gauge configuration for the probability display.
    option = {
        "series": [
            {
                "type": "gauge",
                "startAngle": 90,
                "endAngle": -270,
                "pointer": {
                    "show": False,
                },
                "progress": {
                    "show": True,
                    "overlap": False,
                    "roundCap": False,
                    "clip": False,
                    "backgroundColor": '#11D1F9',
                    "itemStyle": {
                        "color": '#E96605',
                        "borderWidth": 0,
                        "borderColor": "light blue"
                    }
                },
                "axisLine": {
                    "lineStyle": {
                        "width": 40
                    }
                },
                "splitLine": {
                    "show": False,
                    "distance": 0,
                    "length": 20
                },
                "axisTick": {
                    "show": False
                },
                "axisLabel": {
                    "show": False,
                    "distance": 50
                },
                "data": gaugeData,
                "detail": {
                    "valueAnimation": True,
                    "offsetCenter": ['0%', '0%'],
                    "width": 40,
                    "height": 14,
                    "fontSize": 24,
                    "color": 'inherit',
                    "borderColor": 'inherit',
                    "borderRadius": 0,
                    "borderWidth": 0,
                    "formatter": '{value}%'
                },
            }
        ]
    }

    prob, job_cat = predict_cat(model, cls_text)

    # NOTE(review): the original wrapped this output in st.form(key='result')
    # with no form_submit_button, which Streamlit rejects at runtime
    # ("Missing Submit Button") — a plain container renders the same content.
    with st.container():
        if submit_button:
            gaugeData[0]['value'] = prob
            with col1:
                st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Job Category</h1>", unsafe_allow_html=True)
                # Fixed broken markup: the opening <h1 ...> tag was never
                # closed with '>' before the nested <p> element.
                html_str = f"""
                            <h1
                            style='text-align: center;
                                   font: bold {2}em Courier;'>
                            <p class="a">{job_cat}</p>
                            </h1>
                            """

                st.markdown(html_str, unsafe_allow_html=True)
            with col2:
                # st.title("Probability")
                st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Probability</h1>", unsafe_allow_html=True)
                st_echarts(options=option, key="1")
    	
        apps/eda.py
    ADDED
    
    | @@ -0,0 +1,5 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
import streamlit as st


def app():
    """Render the (placeholder) EDA page."""
    st.title("EDA")
    	
        apps/home.py
    ADDED
    
    | @@ -0,0 +1,12 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
import streamlit as st


def app():
    """Render the landing page: project title and motivation blurb."""
    st.title("Ghana - Understanding The Disconnect between Skills and Jobs")

    st.markdown('''
    In Africa, there is often a disconnection between the skills that job seekers possess and the skills that employers require. This can be due to a lack of access to education and training opportunities, as well as a lack of alignment between the education system and the needs of the job market.
 Additionally, many employers in Africa may not have the resources or capacity to provide the necessary training and development for their employees. As a result, there is often a mismatch between the skills that workers have and the skills that employers need, which can make it difficult for workers to find employment or for employers to find qualified candidates.
    ''')
    	
        apps/models.py
    ADDED
    
    | @@ -0,0 +1,6 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
import streamlit as st


def app():
    """Render the (placeholder) Models page."""
    st.title("Models")
    	
        data/labels.txt
    ADDED
    
    | @@ -0,0 +1,22 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ,0
         | 
| 2 | 
            +
            Accounting/Finance/Banking,0
         | 
| 3 | 
            +
            Administrative/Secretarial ,1
         | 
| 4 | 
            +
            Advertising/Media ,2
         | 
| 5 | 
            +
            Agricultural ,3
         | 
| 6 | 
            +
            Communication ,4
         | 
| 7 | 
            +
            Customer Service ,5
         | 
| 8 | 
            +
            Education ,6
         | 
| 9 | 
            +
            "Energy,Oil & Gas ",7
         | 
| 10 | 
            +
            Engineering/Processing/Manufacturing,8
         | 
| 11 | 
            +
            Healthcare ,9
         | 
| 12 | 
            +
            Hospitalilty/Food Service ,10
         | 
| 13 | 
            +
            Human Resource Management ,11
         | 
| 14 | 
            +
            IT ,12
         | 
| 15 | 
            +
            Legal ,13
         | 
| 16 | 
            +
            Mining ,14
         | 
| 17 | 
            +
            Other,15
         | 
| 18 | 
            +
            Purchasing/Procurement ,16
         | 
| 19 | 
            +
            Quality Control/Assurance ,17
         | 
| 20 | 
            +
            Sales / Marketing ,18
         | 
| 21 | 
            +
            Securtity/Law Enforcement ,19
         | 
| 22 | 
            +
            Supply Chain/Logistics/Warehousing ,20
         | 
    	
        data/sample.txt
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            An introduction to computers and how they work. Types and Historical Development of Computers. Number systems: binary, octal, hexadecimal, integer and fractional representations, Signed and Unsigned numbers, 1‘s complement, 2‘s complement and Arithmetic Overflows. Integer and floating point arithmetic (IEEE standard 754 Floating point Formats). Data Representation and Manipulation: Bits, bytes and words: Logic operations and Logic gates applications.
         | 
    	
        models/distilbert/config.json
    ADDED
    
    | @@ -0,0 +1,69 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_name_or_path": "/tmp/tmpqaytn67y",
         | 
| 3 | 
            +
              "activation": "gelu",
         | 
| 4 | 
            +
              "architectures": [
         | 
| 5 | 
            +
                "DistilBertForSequenceClassification"
         | 
| 6 | 
            +
              ],
         | 
| 7 | 
            +
              "attention_dropout": 0.1,
         | 
| 8 | 
            +
              "dim": 768,
         | 
| 9 | 
            +
              "dropout": 0.1,
         | 
| 10 | 
            +
              "hidden_dim": 3072,
         | 
| 11 | 
            +
              "id2label": {
         | 
| 12 | 
            +
                "0": "LABEL_0",
         | 
| 13 | 
            +
                "1": "LABEL_1",
         | 
| 14 | 
            +
                "2": "LABEL_2",
         | 
| 15 | 
            +
                "3": "LABEL_3",
         | 
| 16 | 
            +
                "4": "LABEL_4",
         | 
| 17 | 
            +
                "5": "LABEL_5",
         | 
| 18 | 
            +
                "6": "LABEL_6",
         | 
| 19 | 
            +
                "7": "LABEL_7",
         | 
| 20 | 
            +
                "8": "LABEL_8",
         | 
| 21 | 
            +
                "9": "LABEL_9",
         | 
| 22 | 
            +
                "10": "LABEL_10",
         | 
| 23 | 
            +
                "11": "LABEL_11",
         | 
| 24 | 
            +
                "12": "LABEL_12",
         | 
| 25 | 
            +
                "13": "LABEL_13",
         | 
| 26 | 
            +
                "14": "LABEL_14",
         | 
| 27 | 
            +
                "15": "LABEL_15",
         | 
| 28 | 
            +
                "16": "LABEL_16",
         | 
| 29 | 
            +
                "17": "LABEL_17",
         | 
| 30 | 
            +
                "18": "LABEL_18",
         | 
| 31 | 
            +
                "19": "LABEL_19",
         | 
| 32 | 
            +
                "20": "LABEL_20"
         | 
| 33 | 
            +
              },
         | 
| 34 | 
            +
              "initializer_range": 0.02,
         | 
| 35 | 
            +
              "label2id": {
         | 
| 36 | 
            +
                "LABEL_0": 0,
         | 
| 37 | 
            +
                "LABEL_1": 1,
         | 
| 38 | 
            +
                "LABEL_10": 10,
         | 
| 39 | 
            +
                "LABEL_11": 11,
         | 
| 40 | 
            +
                "LABEL_12": 12,
         | 
| 41 | 
            +
                "LABEL_13": 13,
         | 
| 42 | 
            +
                "LABEL_14": 14,
         | 
| 43 | 
            +
                "LABEL_15": 15,
         | 
| 44 | 
            +
                "LABEL_16": 16,
         | 
| 45 | 
            +
                "LABEL_17": 17,
         | 
| 46 | 
            +
                "LABEL_18": 18,
         | 
| 47 | 
            +
                "LABEL_19": 19,
         | 
| 48 | 
            +
                "LABEL_2": 2,
         | 
| 49 | 
            +
                "LABEL_20": 20,
         | 
| 50 | 
            +
                "LABEL_3": 3,
         | 
| 51 | 
            +
                "LABEL_4": 4,
         | 
| 52 | 
            +
                "LABEL_5": 5,
         | 
| 53 | 
            +
                "LABEL_6": 6,
         | 
| 54 | 
            +
                "LABEL_7": 7,
         | 
| 55 | 
            +
                "LABEL_8": 8,
         | 
| 56 | 
            +
                "LABEL_9": 9
         | 
| 57 | 
            +
              },
         | 
| 58 | 
            +
              "max_position_embeddings": 512,
         | 
| 59 | 
            +
              "model_type": "distilbert",
         | 
| 60 | 
            +
              "n_heads": 12,
         | 
| 61 | 
            +
              "n_layers": 6,
         | 
| 62 | 
            +
              "pad_token_id": 0,
         | 
| 63 | 
            +
              "qa_dropout": 0.1,
         | 
| 64 | 
            +
              "seq_classif_dropout": 0.2,
         | 
| 65 | 
            +
              "sinusoidal_pos_embds": false,
         | 
| 66 | 
            +
              "tie_weights_": true,
         | 
| 67 | 
            +
              "transformers_version": "4.26.0",
         | 
| 68 | 
            +
              "vocab_size": 30522
         | 
| 69 | 
            +
            }
         | 
    	
        models/distilbert/special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "cls_token": "[CLS]",
         | 
| 3 | 
            +
              "mask_token": "[MASK]",
         | 
| 4 | 
            +
              "pad_token": "[PAD]",
         | 
| 5 | 
            +
              "sep_token": "[SEP]",
         | 
| 6 | 
            +
              "unk_token": "[UNK]"
         | 
| 7 | 
            +
            }
         | 
    	
        models/distilbert/tf_model.h5
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2f5ddf0f86b178c98159599cedfbad2b01062ed3290d73f0af7780480ba1106c
         | 
| 3 | 
            +
            size 268014760
         | 
    	
        models/distilbert/tf_model.preproc
    ADDED
    
    | Binary file (2.98 kB). View file | 
|  | 
    	
        models/distilbert/tokenizer.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        models/distilbert/tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,14 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "cls_token": "[CLS]",
         | 
| 3 | 
            +
              "do_lower_case": true,
         | 
| 4 | 
            +
              "mask_token": "[MASK]",
         | 
| 5 | 
            +
              "model_max_length": 512,
         | 
| 6 | 
            +
              "name_or_path": "distilbert-base-uncased",
         | 
| 7 | 
            +
              "pad_token": "[PAD]",
         | 
| 8 | 
            +
              "sep_token": "[SEP]",
         | 
| 9 | 
            +
              "special_tokens_map_file": null,
         | 
| 10 | 
            +
              "strip_accents": null,
         | 
| 11 | 
            +
              "tokenize_chinese_chars": true,
         | 
| 12 | 
            +
              "tokenizer_class": "DistilBertTokenizer",
         | 
| 13 | 
            +
              "unk_token": "[UNK]"
         | 
| 14 | 
            +
            }
         | 
    	
        models/distilbert/vocab.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        requirements.txt
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            streamlit
         | 
| 2 | 
            +
            streamlit-option-menu
         | 
| 3 | 
            +
            streamlit-echarts
         | 
| 4 | 
            +
            ktrain
         | 
| 5 | 
            +
            # spacy>=3.0.0,<4.0.0
         | 
| 6 | 
            +
            # en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl
         | 
| 7 | 
            +
            # skillNer
         | 
| 8 | 
            +
            pandas
         | 
| 9 | 
            +
            numpy
         | 
| 10 | 
            +
            ipython
         | 
| 11 | 
            +
            neattext
         | 
| 12 | 
            +
            tensorflow
         | 
| 13 | 
            +
            st_annotated_text==2.0.0
         | 
| 14 | 
            +
            requests
         | 
| 15 | 
            +
            # nltk
         | 
| 16 | 
            +
            # Unidecode
         | 
    	
        utils.py
    ADDED
    
    | @@ -0,0 +1,180 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import sys
         | 
| 2 | 
            +
            import subprocess
         | 
| 3 | 
            +
            import streamlit as st
         | 
| 4 | 
            +
            import numpy as np
         | 
| 5 | 
            +
            from annotated_text import annotation
         | 
| 6 | 
            +
            import collections
         | 
| 7 | 
            +
            import ktrain
         | 
| 8 | 
            +
            import pandas as pd
         | 
| 9 | 
            +
            import os
         | 
| 10 | 
            +
            import neattext.functions as nfx
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
# Category-name -> numeric-code table, loaded once at import time.
label_path = "./data/labels.txt"
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
| 16 | 
            +
             | 
| 17 | 
            +
             | 
def default_text():
    """Return the sample job-description text shipped with the app.

    The sample file contains non-ASCII typographic quotes, so the
    encoding is pinned to UTF-8 instead of relying on the platform
    default (which breaks on e.g. Windows cp1252).
    """
    with open("./data/sample.txt", 'r', encoding="utf-8") as fs:
        text = fs.read()
    return text
| 22 | 
            +
             | 
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the DistilBERT ktrain predictor from disk.

    Cached by Streamlit so the (expensive) load runs only once per session.
    """
    return ktrain.load_predictor("./models/distilbert/")
| 28 | 
            +
             | 
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    """Build and return a skillNer SkillExtractor.

    Heavy to construct (loads the en_core_web_lg spaCy model), so it is
    cached by Streamlit and only runs the first time it's called.
    """
    import spacy
    from spacy.matcher import PhraseMatcher

    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB

    print('load model')
    nlp = spacy.load('en_core_web_lg')
    print('load matcher')
    # init skill extractor
    return SkillExtractor(nlp, SKILL_DB, PhraseMatcher,)
| 47 | 
            +
             | 
| 48 | 
            +
             | 
| 49 | 
            +
             | 
def clean_text(text):
    """Strip noise from *text* with neattext.

    Removes emails, URLs, dates, HTML tags, numbers, punctuation,
    stopwords and special characters. Returns the cleaned string, or
    None when cleaning raises (the error is printed, not propagated).
    """
    try:
        frame = nfx.TextFrame(text)
        cleaned = (
            frame.remove_emails()
            .remove_urls()
            .remove_dates()
            .remove_html_tags()
            .remove_numbers()
            .remove_puncts()
            .remove_stopwords()
            .remove_special_characters()
        )
        # doc = nlp(result.text)
        # empty_list = []
        # for token in doc:
        # empty_list.append(token.lemma_)
        # final_string = ' '.join(map(str,empty_list))
        return cleaned.text
    except Exception as err:
        print(err)
        return None
| 63 | 
            +
             | 
| 64 | 
            +
             | 
| 65 | 
            +
            def predict_cat(model, text):
         | 
| 66 | 
            +
                p = int(model.predict(text,return_proba=True).max()*100)
         | 
| 67 | 
            +
                cat =  model.predict(text)
         | 
| 68 | 
            +
                return p,cat
         | 
| 69 | 
            +
             | 
| 70 | 
            +
             | 
| 71 | 
            +
            def grouper(iterable):
         | 
| 72 | 
            +
                prev = None
         | 
| 73 | 
            +
                group = []
         | 
| 74 | 
            +
                for item in iterable:
         | 
| 75 | 
            +
                    if not prev or item - prev <= 1:
         | 
| 76 | 
            +
                        group.append(item)
         | 
| 77 | 
            +
                    else:
         | 
| 78 | 
            +
                        yield group
         | 
| 79 | 
            +
                        group = [item]
         | 
| 80 | 
            +
                    prev = item
         | 
| 81 | 
            +
                if group:
         | 
| 82 | 
            +
                    yield group
         | 
| 83 | 
            +
             | 
| 84 | 
            +
             | 
| 85 | 
            +
            def get_skill(annotations):
         | 
| 86 | 
            +
                try:
         | 
| 87 | 
            +
                    # annotations = skill_extractor.annotate(text,tresh=0.5)
         | 
| 88 | 
            +
                    # skill_dict = {"Soft Skill": [], "Hard Skill": []}
         | 
| 89 | 
            +
                    soft_skill = []
         | 
| 90 | 
            +
                    hard_skill = []
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                    for item in annotations['results']['ngram_scored']:
         | 
| 93 | 
            +
                        skill_id = item['skill_id']
         | 
| 94 | 
            +
                        skill_type = skill_extractor.skills_db[skill_id]['skill_type']
         | 
| 95 | 
            +
                        if skill_type == 'Soft Skill' and item['doc_node_value']:
         | 
| 96 | 
            +
                            soft_skill.append(item['doc_node_value'])
         | 
| 97 | 
            +
                        if skill_type == 'Hard Skill':
         | 
| 98 | 
            +
                            hard_skill.append(item['doc_node_value'])  
         | 
| 99 | 
            +
                        # skill_dict['Soft Skill'] =set(soft_skill)
         | 
| 100 | 
            +
                    sk = " ".join(list(set(soft_skill)))
         | 
| 101 | 
            +
                    hk = " ".join(list(set(hard_skill)))
         | 
| 102 | 
            +
                    # st.write(skill_extractor.describe(annotations))
         | 
| 103 | 
            +
                    return sk+hk
         | 
| 104 | 
            +
                except Exception as e:
         | 
| 105 | 
            +
                    return None
         | 
| 106 | 
            +
             | 
| 107 | 
            +
             | 
| 108 | 
            +
            def install(package):
         | 
| 109 | 
            +
                subprocess.check_call([sys.executable, "-m", "pip", "install", package])
         | 
| 110 | 
            +
             | 
| 111 | 
            +
             | 
| 112 | 
            +
             | 
| 113 | 
            +
             | 
| 114 | 
            +
             | 
| 115 | 
            +
            def create_ann_list(text, results):
         | 
| 116 | 
            +
                try:
         | 
| 117 | 
            +
                    from skillNer.general_params import SKILL_DB
         | 
| 118 | 
            +
                except:
         | 
| 119 | 
            +
                    # install skillner if not done yet
         | 
| 120 | 
            +
                    os.system('pip install skillner')
         | 
| 121 | 
            +
                    from skillNer.general_params import SKILL_DB
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                type_to_color = {'Hard Skill': "#faa",
         | 
| 124 | 
            +
                                 'Soft Skill': '#afa', 'Certification': '#ff4'}
         | 
| 125 | 
            +
                text_tokens = text.split(' ')
         | 
| 126 | 
            +
                annots = {}
         | 
| 127 | 
            +
                all_res = results['ngram_scored']+results['full_matches']
         | 
| 128 | 
            +
                ids_done = []
         | 
| 129 | 
            +
                # create annotations from matches
         | 
| 130 | 
            +
                for match in all_res:
         | 
| 131 | 
            +
                    id_ = match['skill_id']
         | 
| 132 | 
            +
                    type_ = SKILL_DB[id_]['skill_type']
         | 
| 133 | 
            +
                    span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
         | 
| 134 | 
            +
                    annot = annotation(span_str, type_, background=type_to_color[type_],
         | 
| 135 | 
            +
                                       color="#333", margin='2px')
         | 
| 136 | 
            +
                    annots[match['doc_node_id'][0]] = annot
         | 
| 137 | 
            +
                    for i in match['doc_node_id']:
         | 
| 138 | 
            +
                        ids_done.append(i)
         | 
| 139 | 
            +
                # create strs for non annotated text
         | 
| 140 | 
            +
                non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
         | 
| 141 | 
            +
                dict_ = dict(enumerate(grouper(non_match_ids), 1))
         | 
| 142 | 
            +
                for v in dict_.values():
         | 
| 143 | 
            +
                    span = ' '.join([text_tokens[i] for i in v])
         | 
| 144 | 
            +
                    annots[v[0]] = span
         | 
| 145 | 
            +
                    # annotation(token,color="#fff", background="transparent",)
         | 
| 146 | 
            +
                print(dict_)
         | 
| 147 | 
            +
                print('-----')
         | 
| 148 | 
            +
                # print(collections.OrderedDict(sorted(annots.items())))
         | 
| 149 | 
            +
                annots_ = collections.OrderedDict(sorted(annots.items())).values()
         | 
| 150 | 
            +
                return annots_
         | 
| 151 | 
            +
             | 
| 152 | 
            +
             | 
| 153 | 
            +
            def create_dfs(results):
         | 
| 154 | 
            +
                try:
         | 
| 155 | 
            +
                    from skillNer.general_params import SKILL_DB
         | 
| 156 | 
            +
                except:
         | 
| 157 | 
            +
                    # install skillner if not done yet
         | 
| 158 | 
            +
                    os.system('pip install skillner')
         | 
| 159 | 
            +
                    from skillNer.general_params import SKILL_DB
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                f_matches = results['full_matches']
         | 
| 162 | 
            +
                f_arr = []
         | 
| 163 | 
            +
                for match in f_matches:
         | 
| 164 | 
            +
                    id_ = match['skill_id']
         | 
| 165 | 
            +
                    full_name = SKILL_DB[id_]['skill_name']
         | 
| 166 | 
            +
                    type_ = SKILL_DB[id_]['skill_type']
         | 
| 167 | 
            +
                    f_arr.append([id_, full_name, type_])
         | 
| 168 | 
            +
                s_matches = results['ngram_scored']
         | 
| 169 | 
            +
                s_arr = []
         | 
| 170 | 
            +
                for match in s_matches:
         | 
| 171 | 
            +
                    id_ = match['skill_id']
         | 
| 172 | 
            +
                    full_name = SKILL_DB[id_]['skill_name']
         | 
| 173 | 
            +
                    type_ = SKILL_DB[id_]['skill_type']
         | 
| 174 | 
            +
                    score = match['score']
         | 
| 175 | 
            +
                    s_arr.append([id_, full_name, type_, score])
         | 
| 176 | 
            +
                full_df = pd.DataFrame(
         | 
| 177 | 
            +
                    f_arr, columns=['skill id', 'skill name', 'skill type'])
         | 
| 178 | 
            +
                sub_df = pd.DataFrame(
         | 
| 179 | 
            +
                    s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
         | 
| 180 | 
            +
                return full_df, sub_df
         |