# company_name_matches_using_embeddings
#
# Runtime error
#
# File size: 3,572 Bytes

import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer

# The Pinecone API key is read from the PINECONE_API_KEY environment variable
# rather than being embedded in source. A missing variable raises KeyError at
# startup so a misconfigured deployment fails immediately.
# NOTE(review): the key that used to be committed on this line should be
# treated as leaked and rotated.
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    # The environment name is not a secret, so the original value remains the
    # default and PINECONE_ENVIRONMENT is only an optional override.
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp'),
)

# Sentence-embedding model used to encode search queries; loaded on CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')

def process_string(s):
    """Normalize text for matching.

    Args:
        s: The raw string to normalize.

    Returns:
        ``s`` lower-cased, with every ``&`` replaced by ``and``.
    """
    lowered = s.lower()
    return lowered.replace('&', 'and')

def levenshtein_distance(s1, s2):
    """Score how closely two strings match using ``fuzz.ratio``.

    Despite the name, the result is the similarity score produced by
    ``fuzz.ratio`` (0-100, larger means a closer match), not an edit
    distance. The callers in this file accordingly sort by it in descending
    order.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The value returned by ``fuzz.ratio(s1, s2)``.
    """
    similarity = fuzz.ratio(s1, s2)
    return similarity

def compare_string_all(string, df):
    """Return the five rows whose ``cleaned_text`` best matches ``string``.

    The query is normalized with ``process_string`` and scored against every
    row's ``cleaned_text`` with ``levenshtein_distance``. Rows are ranked by
    that score in descending order and the first five are returned.

    The input frame is not modified: the scores are attached to a copy.

    Args:
        string: Raw query text.
        df: DataFrame with ``label``, ``Ingredients`` and ``cleaned_text``
            columns.

    Returns:
        A new DataFrame of at most five rows with the columns ``label``,
        ``Ingredients`` and ``distance`` (the match score).
    """
    query = process_string(string)

    scores = df['cleaned_text'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )

    # ``assign`` builds a new frame, so the caller's ``df`` does not silently
    # gain (or have overwritten) a ``distance`` column.
    ranked = df.assign(distance=scores).sort_values('distance', ascending=False)

    return ranked.head(5)[['label', 'Ingredients', 'distance']]

def compare_string_label(string, df):
    """Return the five rows whose ``cleaned_label`` best matches ``string``.

    The query is normalized with ``process_string`` and scored against every
    row's ``cleaned_label`` with ``levenshtein_distance``. Rows are ranked by
    that score in descending order and the first five are returned.

    The input frame is not modified: the scores are attached to a copy.

    Args:
        string: Raw query text.
        df: DataFrame with ``label``, ``Ingredients`` and ``cleaned_label``
            columns.

    Returns:
        A new DataFrame of at most five rows with the columns ``label``,
        ``Ingredients`` and ``distance`` (the match score).
    """
    query = process_string(string)

    scores = df['cleaned_label'].apply(
        lambda label: levenshtein_distance(query, label.lower())
    )

    # ``assign`` builds a new frame, so the caller's ``df`` does not silently
    # gain (or have overwritten) a ``distance`` column.
    ranked = df.assign(distance=scores).sort_values('distance', ascending=False)

    return ranked.head(5)[['label', 'Ingredients', 'distance']]

# Load the reference table used by the fuzzy-matching helpers.
df = pd.read_json('cleaned.json')

# "<label> : <Ingredients>" text, stored both raw and normalized.
# Columns are added in the same order as before so the frame layout is unchanged.
_combined_text = df['label'] + ' : ' + df['Ingredients']
df['label+ingradient'] = _combined_text
df['cleaned_text'] = _combined_text.apply(process_string)

# Normalized label on its own, for label-only matching.
df['cleaned_label'] = df['label'].apply(process_string)

# Handle to the Pinecone index queried by the semantic search page.
index = pinecone.Index('companiessearch')


# Create a Streamlit app
def main():
    """Render the Streamlit page and run a semantic company-name search.

    The page shows a text box and an "Enter" button. When the button is
    pressed with a non-empty query, the normalized query is encoded with the
    module-level ``model``, sent to the module-level Pinecone ``index``, and
    the matches are displayed as a table with ``Company_name``, ``Country``
    and ``score`` columns.
    """
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("Company name matching App :smiley:")

    # Only one page exists, but the sidebar selector is kept as-is.
    pages = ["Semantic search"]
    page = st.sidebar.radio("Select a page", pages)

    if page != pages[0]:
        return

    st.header("Matches using embeddings (semantic search)")
    st.write("Enter a company name:")
    st.write("e.g. Airtel Africa Plc")
    input_string = process_string(st.text_input(""))

    if not st.button("Enter"):
        return

    # Do not spend an encode + index query on an empty or whitespace-only box.
    if not input_string.strip():
        st.warning("Please enter a company name to search for.")
        return

    # One value drives both the heading and the query so they cannot drift
    # apart (the heading used to say "Top 5" while 10 results were requested).
    top_k = 10
    st.write(f"Top {top_k} matches using semantic search:")

    xq = model.encode([input_string]).tolist()
    # NOTE(review): the positional query vector and the camelCase
    # `includeMetadata` keyword are kept exactly as before; confirm they match
    # the installed pinecone client version.
    result = index.query(xq, top_k=top_k, includeMetadata=True)

    rows = [
        (match['metadata']['name'], match['metadata']['Country'], match['score'])
        for match in result['matches']
    ]
    final_result = pd.DataFrame(rows, columns=['Company_name', 'Country', 'score'])

    st.dataframe(final_result)

# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()