Spaces:
Runtime error
Runtime error
File size: 3,572 Bytes
819f923 5af49e7 819f923 5a2d833 819f923 5af49e7 819f923 5a2d833 819f923 5a2d833 819f923 5a2d833 819f923 5a2d833 819f923 5a2d833 819f923 5a2d833 819f923 5af49e7 819f923 5a2d833 819f923 5a2d833 819f923 5a2d833 819f923 9a46289 5a2d833 819f923 9a46289 819f923 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer
# SECURITY: an API key committed to source control should be treated as
# compromised. Supply PINECONE_API_KEY / PINECONE_ENVIRONMENT through the
# environment; the literals below remain only as backward-compatible
# fallbacks. TODO: rotate this key and delete the fallback.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY') or '72677043-918a-4a15-9077-9c5b3cc40df9',
    environment=os.environ.get('PINECONE_ENVIRONMENT') or 'us-west4-gcp',
)
# Sentence-embedding model used to encode search queries; pinned to the CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
def process_string(s):
    """Normalise text for matching.

    Lower-cases ``s`` and spells out every ``&`` as ``and``.
    """
    lowered = s.lower()
    return lowered.replace('&', 'and')
def levenshtein_distance(s1, s2):
    """Return ``fuzz.ratio(s1, s2)`` for the two strings.

    NOTE(review): despite the name, callers in this file treat the result as
    a similarity score - they sort it in descending order and keep the top
    rows - so a larger value presumably means a closer match.
    """
    score = fuzz.ratio(s1, s2)
    return score
def compare_string_all(string, df):
    """Return the five rows whose combined text best matches ``string``.

    Args:
        string: Raw query text; it is normalised with ``process_string``.
        df: DataFrame with ``cleaned_text``, ``label`` and ``Ingredients``
            columns. It is not modified.

    Returns:
        A DataFrame with the columns ``label``, ``Ingredients`` and
        ``distance`` holding the five highest-scoring rows, best first.
    """
    query = process_string(string)
    scores = df['cleaned_text'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # assign() works on a copy, so the caller's frame does not silently gain
    # (or have overwritten) a 'distance' column on every call.
    scored = df.assign(distance=scores)
    top_5_df = scored.sort_values('distance', ascending=False).head(5)
    return top_5_df[['label', 'Ingredients', 'distance']]
def compare_string_label(string, df):
    """Return the five rows whose label best matches ``string``.

    Args:
        string: Raw query text; it is normalised with ``process_string``.
        df: DataFrame with ``cleaned_label``, ``label`` and ``Ingredients``
            columns. It is not modified.

    Returns:
        A DataFrame with the columns ``label``, ``Ingredients`` and
        ``distance`` holding the five highest-scoring rows, best first.
    """
    query = process_string(string)
    scores = df['cleaned_label'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # assign() works on a copy, so the caller's frame does not silently gain
    # (or have overwritten) a 'distance' column on every call.
    scored = df.assign(distance=scores)
    top_5_df = scored.sort_values('distance', ascending=False).head(5)
    return top_5_df[['label', 'Ingredients', 'distance']]
# Load the records from cleaned.json; the code below expects 'label' and
# 'Ingredients' columns.
df = pd.read_json('cleaned.json')

# Combined "label : ingredients" text for each row.
df['label+ingradient'] = df['label'] + ' : ' + df['Ingredients']

# Normalised copies of the text columns, used by the string-matching helpers.
df['cleaned_text'] = df['label+ingradient'].map(process_string)
df['cleaned_label'] = df['label'].map(process_string)

# Handle to the Pinecone index queried by the app.
index = pinecone.Index('companiessearch')
# Create a Streamlit app
def main():
    """Render the company-name search page.

    Reads a company name from a text box, embeds it with the module-level
    ``model`` when the "Enter" button is pressed, queries the module-level
    Pinecone ``index`` and shows the matches (name, country, score) in a
    table.
    """
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("Company name matching App :smiley:")

    # Define pages
    pages = ["Semantic search"]
    # Add radio buttons to toggle between pages
    page = st.sidebar.radio("Select a page", pages)

    # NOTE: a fuzzy string-matching page built on compare_string_all /
    # compare_string_label used to be offered here and is currently disabled;
    # semantic search is the only page.
    if page != pages[0]:
        return

    st.header("Matches using embeddings (semantic search)")
    st.write("Enter a company name:")
    st.write("e.g. Airtel Africa Plc")
    input_string = process_string(st.text_input(""))

    if not st.button("Enter"):
        return

    # Do not embed and query a blank string; ask for input instead.
    if not input_string.strip():
        st.warning("Please enter a company name to search for.")
        return

    # One value drives both the query and the heading so they cannot drift
    # apart (the heading previously said 5 while 10 results were requested).
    top_k = 10
    st.write(f"Top {top_k} matches using semantic search:")

    xq = model.encode([input_string]).tolist()
    result = index.query(xq, top_k=top_k, includeMetadata=True)

    rows = [
        (match['metadata']['name'], match['metadata']['Country'], match['score'])
        for match in result['matches']
    ]
    final_result = pd.DataFrame(rows, columns=['Company_name', 'Country', 'score'])
    st.dataframe(final_result)
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|