File size: 3,572 Bytes
819f923
 
 
 
 
 
 
 
 
5af49e7
819f923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a2d833
819f923
 
 
 
 
5af49e7
819f923
 
5a2d833
819f923
 
 
 
5a2d833
 
 
 
 
819f923
5a2d833
819f923
5a2d833
 
819f923
5a2d833
 
 
 
819f923
5a2d833
819f923
5af49e7
819f923
5a2d833
 
819f923
 
 
 
5a2d833
 
819f923
 
 
 
 
 
 
 
 
5a2d833
 
819f923
 
9a46289
5a2d833
819f923
9a46289
 
 
819f923
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer

# Pinecone credentials are read from the environment when available so the key
# does not have to be edited in (or committed to) source.
# NOTE(security): the literal fallbacks below are the values that were
# previously hard-coded; they are kept only so existing deployments keep
# working. The key should be rotated and the fallback removed once
# PINECONE_API_KEY is configured everywhere this app runs.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', '72677043-918a-4a15-9077-9c5b3cc40df9'),
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp'),
)

# Sentence-embedding model used to encode search queries; pinned to CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')

def process_string(s):
    """Normalise ``s`` for matching: lower-case it and spell out ``&`` as ``and``."""
    lowered = s.lower()
    normalised = lowered.replace('&', 'and')
    return normalised

def levenshtein_distance(s1, s2):
    """Score how alike ``s1`` and ``s2`` are by delegating to ``fuzz.ratio``.

    NOTE(review): despite the name, the callers in this file sort this value
    in descending order, i.e. they treat a higher score as a better match
    (presumably a 0-100 similarity ratio - confirm against fuzzywuzzy docs).
    """
    similarity = fuzz.ratio(s1, s2)
    return similarity

def compare_string_all(string, df):
    """Return the five rows whose ``cleaned_text`` best matches ``string``.

    The query is normalised with ``process_string`` and scored against every
    value in ``df['cleaned_text']`` using ``levenshtein_distance``; rows are
    ranked with the highest score first.

    Args:
        string: Free-text query to match.
        df: DataFrame providing the ``cleaned_text``, ``label`` and
            ``Ingredients`` columns. It is not modified.

    Returns:
        A new DataFrame with at most five rows and the columns ``label``,
        ``Ingredients`` and ``distance`` (the match score).
    """
    query = process_string(string)

    scores = df['cleaned_text'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )

    # Attach the scores to a fresh frame instead of writing a 'distance'
    # column into the caller's DataFrame.
    scored = df[['label', 'Ingredients']].assign(distance=scores)

    return scored.sort_values('distance', ascending=False).head(5)

def compare_string_label(string, df):
    """Return the five rows whose ``cleaned_label`` best matches ``string``.

    The query is normalised with ``process_string`` and scored against every
    value in ``df['cleaned_label']`` using ``levenshtein_distance``; rows are
    ranked with the highest score first.

    Args:
        string: Free-text query to match.
        df: DataFrame providing the ``cleaned_label``, ``label`` and
            ``Ingredients`` columns. It is not modified.

    Returns:
        A new DataFrame with at most five rows and the columns ``label``,
        ``Ingredients`` and ``distance`` (the match score).
    """
    query = process_string(string)

    scores = df['cleaned_label'].apply(
        lambda label: levenshtein_distance(query, label.lower())
    )

    # Attach the scores to a fresh frame instead of writing a 'distance'
    # column into the caller's DataFrame.
    scored = df[['label', 'Ingredients']].assign(distance=scores)

    return scored.sort_values('distance', ascending=False).head(5)

# Load the records that the fuzzy-matching helpers score against.
df = pd.read_json('cleaned.json')

# Combined "label : Ingredients" text per row. The column name's spelling is
# kept as-is because it is a runtime key.
df['label+ingradient'] = df['label'] + ' : ' + df['Ingredients']

# Normalised variants used by compare_string_all / compare_string_label.
df['cleaned_text'] = df['label+ingradient'].map(process_string)
df['cleaned_label'] = df['label'].map(process_string)

# Handle to the Pinecone index queried by the semantic-search page.
index = pinecone.Index('companiessearch')


# Create a Streamlit app
def main():
    """Render the Streamlit UI and run a semantic company-name search.

    The page shows a text box and an "Enter" button. When the button is
    pressed with a non-empty query, the normalised query is encoded with the
    module-level ``model``, the module-level Pinecone ``index`` is queried,
    and the matches are displayed as a table of company name, country and
    score. An empty query shows a warning instead of hitting the index.
    """
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("Company name matching App :smiley:")

    # Define pages
    pages = ["Semantic search"]

    # Add radio buttons to toggle between pages
    page = st.sidebar.radio("Select a page", pages)

    # NOTE: a Levenshtein-based page built on compare_string_all /
    # compare_string_label is currently disabled; only semantic search is
    # exposed.

    if page == pages[0]:
        st.header("Matches using embeddings (semantic search)")
        st.write("Enter a company name:")
        st.write("e.g. Airtel Africa Plc")
        input_string = st.text_input("")

        input_string = process_string(input_string)

        if st.button("Enter"):
            # Don't embed and query an empty string; ask for input instead.
            if not input_string.strip():
                st.warning("Please enter a company name to search for.")
                return

            # Single source of truth for the result count so the header and
            # the query cannot disagree (the header used to say "Top 5" while
            # 10 results were requested).
            top_k = 10
            st.write(f"Top {top_k} matches using semantic search:")

            xq = model.encode([input_string]).tolist()
            result = index.query(xq, top_k=top_k, includeMetadata=True)

            rows = [
                (match['metadata']['name'], match['metadata']['Country'], match['score'])
                for match in result['matches']
            ]
            final_result = pd.DataFrame(rows, columns=['Company_name', 'Country', 'score'])

            st.dataframe(final_result)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()