Spaces:

naveed-stockmark
/

kg_reasoning_demo

Runtime error

App Files Files Community

kg_reasoning_demo / app_old.py

naveed-stockmark

Rename app.py to app_old.py

cc570fd verified about 1 year ago

raw

history blame contribute delete

8.65 kB

	import pandas as pd
	from utils import normalize_text
	import streamlit as st

	### Data paths
	# WIKIPEDIA_PATH = "./kensho_en_wiki_typing_technical.csv"
	# WIKIDATA_PATH = "./wikidata_ss_processed.csv"
	# REBEL_INFER_PATH = "./rebel_inference_processed_ss.csv"
	# ENTITY_LINKING_PATH = "./linking_df_technical_min.csv"

	# Numeric part of the Wikidata property id for each KG relation name.
	# The number is appended to ".../wiki/Property:P" when a relation link is
	# rendered for a reasoning path.
	relation_to_id = dict(
	    uses=2283,
	    has_use=366,
	    part_of=361,
	    has_part=527,
	    made_from_material=186,
	)

	# Page heading.
	st.title("Materials use case search app")

	# Free-text query box, pre-filled with "steel" and registered under the
	# widget key "ent". The returned string drives the rest of the script.
	input_text = st.text_input(
	    "Enter the name of a material i.e steel, sand, plastic, etc and press Enter",
	    value="steel",
	    key="ent",
	)

	# Status line emitted before the data-loading calls that follow.
	st.write("preparing data ...")

	# Wikipedia metadata
	@st.cache_data(persist="disk")
	def get_wiki_df(path="./kensho_en_wiki_typing_technical.csv"):
	    """Load the Wikipedia article metadata and derive the entity lookups.

	    The CSV at `path` must provide the columns `exclude`, `technical`,
	    `page_id`, `skpe_id`, `item_id`, `title_x`, `en_probs` and
	    `Unnamed: 0`. The result is cached by Streamlit and persisted to disk.

	    Returns:
	        A 3-tuple `(wiki_df, include_skpes, skpe_to_wikidata)`:
	        * `wiki_df` - every row of the CSV, minus the `Unnamed: 0`,
	          `en_probs` and `exclude` columns, with `title_x` renamed to
	          `en_title`. Rows are NOT filtered here.
	        * `include_skpes` - set of `skpe_id` values whose page is
	          technical and not excluded.
	        * `skpe_to_wikidata` - dict mapping every `skpe_id` to its
	          `item_id`.
	    """
	    wiki_df = pd.read_csv(path)

	    # Keep only technical, non-excluded articles. The explicit comparisons
	    # to True/False are deliberate: a missing value compares unequal to
	    # both, so a row with missing flags is not rejected by that flag.
	    rejected_rows = (wiki_df.exclude == True) | (wiki_df.technical == False)
	    exclude_ids = set(wiki_df.loc[rejected_rows, 'page_id'].to_list())

	    # Filtering goes through page_id so that one rejected row removes
	    # every row sharing that page. `isin` does the membership test in one
	    # vectorised pass instead of one Python call per row.
	    kept_rows = ~wiki_df.page_id.isin(exclude_ids)
	    include_skpes = set(wiki_df.loc[kept_rows, 'skpe_id'].to_list())

	    # Built from ALL rows, so it also covers entities outside include_skpes.
	    skpe_to_wikidata = dict(zip(wiki_df.skpe_id.to_list(), wiki_df.item_id.to_list()))

	    wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
	    wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})

	    return wiki_df, include_skpes, skpe_to_wikidata

	# Load the Wikipedia metadata once per script run. `include_skpes` and
	# `skpe_to_wikidata` are read as module-level globals by the KG loaders and
	# by the input-matching code further down.
	wiki_df, include_skpes, skpe_to_wikidata = get_wiki_df()

	# KG data source 1: Wikidata
	@st.cache_data(persist="disk")
	def get_wikidata_df(path="./wikidata_ss_processed.csv"):
	    """Load the Wikidata edge list restricted to technical entities.

	    Reads the CSV at `path` (must provide `source_skpe` and `target_skpe`)
	    and keeps an edge only when both endpoints are in the module-level
	    `include_skpes` set. The endpoints are translated to Wikidata ids via
	    the module-level `skpe_to_wikidata` dict, the skpe columns are
	    dropped, and every row is tagged with `source == 'wikidata'`.

	    NOTE(review): the Streamlit cache is keyed on `path` only, so a cached
	    result presumably does not refresh when the two globals change -
	    confirm against the Streamlit caching documentation.

	    Returns:
	        The filtered DataFrame with added `source_wikidata`,
	        `target_wikidata` and `source` columns.
	    """
	    wikidata_df = pd.read_csv(path)

	    # filter technical wikidata
	    # `isin` always produces a boolean mask, also for an empty frame, and
	    # avoids one Python call per row.
	    both_technical = (
	        wikidata_df.source_skpe.isin(include_skpes)
	        & wikidata_df.target_skpe.isin(include_skpes)
	    )
	    # Copy so the column assignments below write to an independent frame
	    # rather than to a slice of the frame that was read from disk.
	    wikidata_df = wikidata_df[both_technical].copy()

	    # Every surviving id is in include_skpes, which is derived from the
	    # same column as the keys of skpe_to_wikidata, so each lookup hits.
	    wikidata_df['source_wikidata'] = wikidata_df.source_skpe.map(skpe_to_wikidata)
	    wikidata_df['target_wikidata'] = wikidata_df.target_skpe.map(skpe_to_wikidata)
	    wikidata_df = wikidata_df.drop(columns=['source_skpe', 'target_skpe'])

	    # Provenance tag shown next to each edge of a rendered reasoning path.
	    wikidata_df['source'] = 'wikidata'

	    return wikidata_df

	# Wikidata-derived edges whose two endpoints are both technical entities.
	wikidata_df = get_wikidata_df()

	@st.cache_data(persist="disk")
	def get_rebel_infer_df(path="./rebel_inference_processed_ss.csv"):
	    """Load the REBEL-inferred edge list and align it with the Wikidata edges.

	    Reads the CSV at `path` and keeps an edge only when both
	    `source_skpe_id` and `target_skpe_id` are strings that appear as keys
	    of the module-level `skpe_to_wikidata` dict. The endpoints are
	    translated to Wikidata ids, bookkeeping columns are dropped, the
	    original `source`/`target` columns are renamed to
	    `source_en`/`target_en`, self-loops are removed, and every row is
	    tagged with `source == 'rebel_wikipedia'`.

	    Returns:
	        The filtered DataFrame with added `source_wikidata`,
	        `target_wikidata` and `source` columns.
	    """
	    rebel_infer_df = pd.read_csv(path)

	    known_skpes = list(skpe_to_wikidata)

	    def _is_linked(ids):
	        # True where the id is a string (non-string cells are rejected)
	        # AND has a Wikidata mapping. astype(bool) keeps the mask boolean
	        # even when the column is empty.
	        is_text = ids.map(lambda value: isinstance(value, str)).astype(bool)
	        return is_text & ids.isin(known_skpes)

	    # filter technical
	    both_linked = (
	        _is_linked(rebel_infer_df.source_skpe_id)
	        & _is_linked(rebel_infer_df.target_skpe_id)
	    )
	    # Copy so the column assignments below do not write to a slice.
	    rebel_infer_df = rebel_infer_df[both_linked].copy()

	    rebel_infer_df['source_wikidata'] = rebel_infer_df.source_skpe_id.map(skpe_to_wikidata)
	    rebel_infer_df['target_wikidata'] = rebel_infer_df.target_skpe_id.map(skpe_to_wikidata)

	    rebel_infer_df = rebel_infer_df.drop(
	        columns=['instance_id', 'source_text', 'target_text', 'page_skpe_id', 'source_skpe_id', 'target_skpe_id']
	    )
	    # Only the text columns are renamed; the skpe id columns were dropped
	    # on the previous line, so there is nothing left of them to rename.
	    rebel_infer_df = rebel_infer_df.rename(columns={'source': 'source_en', 'target': 'target_en'})

	    # Drop self-loops (an entity related to itself).
	    no_self_loop = rebel_infer_df.source_wikidata != rebel_infer_df.target_wikidata
	    rebel_infer_df = rebel_infer_df[no_self_loop].copy()

	    # `source` now holds the provenance tag; the former `source` column
	    # lives on as `source_en` after the rename above.
	    rebel_infer_df['source'] = 'rebel_wikipedia'

	    return rebel_infer_df

	# KG data source 2: edges inferred from Wikipedia text.
	rebel_infer_df = get_rebel_infer_df()

	# Stack both edge sources into a single knowledge-graph table. The row
	# labels are not reset, so index values may repeat across the two sources.
	kg_df = pd.concat([wikidata_df, rebel_infer_df])

	@st.cache_data(persist="disk")
	def get_entity_linking_df(path="./linking_df_technical_min.csv"):
	    """Read the entity-linking table at `path` and return it unmodified.

	    The result is cached by Streamlit and persisted to disk.
	    """
	    return pd.read_csv(path)

	st.write("matching input text ...")

	linking_df = get_entity_linking_df()

	# normalise and match
	# The normalised query is compared for exact equality with the `text`
	# column of the linking table.
	text_norm = normalize_text(input_text)
	match_df = linking_df[linking_df.text == text_norm]

	# Keep only mentions whose entity has a Wikidata mapping. `isin` always
	# yields a boolean mask, including when no row matched the query, so the
	# script reliably reaches the "no matches" branch further down. Copy so
	# the column assignment writes to an independent frame, not to a slice of
	# the cached linking table.
	match_df = match_df[match_df.skpe_id.isin(list(skpe_to_wikidata))].copy()
	match_df['wikidata_id'] = match_df.skpe_id.map(skpe_to_wikidata)

	def _use_edges(graph_df, item_id):
	    """Return the KG edges stating what `item_id` is used for.

	    An edge qualifies when it is `item_id --has_use--> X` or
	    `X --uses--> item_id`.
	    """
	    has_use = (graph_df.relation == 'has_use') & (graph_df.source_wikidata == item_id)
	    used_by = (graph_df.relation == 'uses') & (graph_df.target_wikidata == item_id)
	    return graph_df[has_use | used_by]


	def _part_edges(graph_df, item_id):
	    """Return the KG edges linking `item_id` to something it is a part of.

	    An edge qualifies when it is `X --has_part--> item_id` or
	    `item_id --part_of--> X`.
	    """
	    contained_in = (graph_df.relation == 'has_part') & (graph_df.target_wikidata == item_id)
	    part_of = (graph_df.relation == 'part_of') & (graph_df.source_wikidata == item_id)
	    return graph_df[contained_in | part_of]


	def _collect_paths(graph_df, made_of_edges):
	    """Build every 2- and 3-edge reasoning path starting from `made_of_edges`.

	    Each path is a list of row tuples (from `itertuples`):
	    * 2 edges: made_from_material -> use of the product;
	    * 3 edges: made_from_material -> product is part of a whole -> use of
	      that whole.
	    """
	    paths = []
	    for first_edge in made_of_edges.itertuples():
	        first_item = first_edge.source_wikidata

	        # Direct applications of the product made from the material.
	        for second_edge in _use_edges(graph_df, first_item).itertuples():
	            paths.append([first_edge, second_edge])

	        # Applications of anything the product is a part of.
	        for second_edge in _part_edges(graph_df, first_item).itertuples():
	            # The "whole" sits on the source side of has_part and on the
	            # target side of part_of.
	            if second_edge.relation == 'has_part':
	                second_item = second_edge.source_wikidata
	            else:
	                second_item = second_edge.target_wikidata

	            for third_edge in _use_edges(graph_df, second_item).itertuples():
	                paths.append([first_edge, second_edge, third_edge])
	    return paths


	def _render_path(index, path):
	    """Write one reasoning path and its conclusion to the Streamlit page."""
	    first_edge, last_edge = path[0], path[-1]

	    material = first_edge.target_en
	    material_url = f"https://www.wikidata.org/wiki/Q{first_edge.target_wikidata}"

	    # For `uses` the use case is the subject of the edge; otherwise
	    # (`has_use`) it is the object.
	    if last_edge.relation == 'uses':
	        use_case, use_case_wikidata = last_edge.source_en, last_edge.source_wikidata
	    else:
	        use_case, use_case_wikidata = last_edge.target_en, last_edge.target_wikidata
	    use_case_url = f"https://www.wikidata.org/wiki/Q{use_case_wikidata}"

	    st.write(f"Reasoning Path {index + 1}:")

	    for edge in path:
	        source_url = f"https://www.wikidata.org/wiki/Q{edge.source_wikidata}"
	        target_url = f"https://www.wikidata.org/wiki/Q{edge.target_wikidata}"
	        relation_url = f"https://www.wikidata.org/wiki/Property:P{relation_to_id[edge.relation]}"

	        st.markdown(f"[{edge.source_en}]({source_url}) --[{edge.relation}]({relation_url})--> [{edge.target_en}]({target_url}) (source: {edge.source})")

	    st.write("Conclusion:")
	    st.write(f"[{material}]({material_url}) is useful for [{use_case}]({use_case_url})")
	    st.write("------")


	# top match skpe
	if not match_df.empty:

	    # The most frequently linked Wikidata id is treated as the top match.
	    top_wikidata = match_df.wikidata_id.mode()[0]
	    all_wikidata = set(match_df.wikidata_id.to_list())

	    # Match list: share of the matching mentions that link to each entity.
	    link_scores = match_df.wikidata_id.value_counts(normalize=True)
	    wiki_match_df = wiki_df[wiki_df.item_id.isin(all_wikidata)].copy()
	    wiki_match_df['link_score'] = wiki_match_df['item_id'].map(link_scores)
	    wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)

	    # show similar results (the five most viewed candidates)
	    st.write(f"Found following matches for the term {input_text}")
	    st.write(wiki_match_df.sort_values(by='views', ascending=False)[:5])

	    # proceeding with top match
	    st.write("Performing use case extraction for the following top match ...")
	    st.write(wiki_df[wiki_df.item_id == top_wikidata])

	    # Stuff that are made out of input
	    made_of_df = kg_df[(kg_df.relation == 'made_from_material') & (kg_df.target_wikidata == top_wikidata)].copy()

	    if not made_of_df.empty:

	        st.write(f"Discovered following entities made out of {input_text}")
	        st.write(made_of_df[['source_ja', 'source_en', 'relation', 'target_ja', 'target_en', 'source', 'page_title']])

	        st.write("Extracting knowledge graph paths ...")

	        all_paths = _collect_paths(kg_df, made_of_df)

	        if all_paths:

	            st.write(f"Found {len(all_paths)} knowledge graph paths relevant to use cases of {input_text}")
	            st.write("------")

	            # print all paths
	            for i, path in enumerate(all_paths):
	                _render_path(i, path)
	        else:
	            st.write("Found no knowledge graph paths relevant to use cases")
	    else:
	        # f-string so the queried term is interpolated into the message
	        # instead of the placeholder being printed literally.
	        st.write(f"Found no entities that are made from {input_text}")

	else:
	    st.write("no matches")