Spaces:
Build error
Build error
taskswithcode
committed on
Commit
·
889fe1c
1
Parent(s):
aea620e
Fix
Browse files
- app.py +1 -1
- twc_clustering.py +29 -5
app.py
CHANGED
@@ -160,7 +160,7 @@ def display_results(orig_sentences,results,response_info,app_mode,model_name):
|
|
160 |
main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
|
161 |
main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model: <b>{model_name}</b></div>"
|
162 |
score_text = "cosine distance"
|
163 |
-
main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}. <b>{len(results['clusters'])} clusters</b>. mean:{results['info']['mean']:.2f}
|
164 |
body_sent = []
|
165 |
download_data = {}
|
166 |
for i in range(len(results["clusters"])):
|
|
|
160 |
main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
|
161 |
main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model: <b>{model_name}</b></div>"
|
162 |
score_text = "cosine distance"
|
163 |
+
main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}. <b>{len(results['clusters'])} clusters</b>. mean:{results['info']['mean']:.2f}; std:{results['info']['std']:.2f}; current threshold:{results['info']['current_threshold']}<br/>Threshold hints:{str(results['info']['zscores'])}<br/>Overlap stats(overlap,freq):{str(results['info']['overlap'])}</div>"
|
164 |
body_sent = []
|
165 |
download_data = {}
|
166 |
for i in range(len(results["clusters"])):
|
twc_clustering.py
CHANGED
@@ -14,7 +14,7 @@ class TWCClustering:
|
|
14 |
print("In Zscore Clustering")
|
15 |
|
16 |
def compute_matrix(self,embeddings):
|
17 |
-
print("Computing similarity matrix ...)")
|
18 |
embeddings= np.array(embeddings)
|
19 |
start = time.time()
|
20 |
vec_a = embeddings.T #vec_a shape (1024,)
|
@@ -23,7 +23,7 @@ class TWCClustering:
|
|
23 |
similarity_matrix = np.inner(vec_a,vec_a)
|
24 |
end = time.time()
|
25 |
time_val = (end-start)*1000
|
26 |
-
print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
|
27 |
return similarity_matrix
|
28 |
|
29 |
def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
|
@@ -63,6 +63,24 @@ class TWCClustering:
|
|
63 |
return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
|
64 |
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
def cluster(self,output_file,texts,embeddings,threshold = 1.5):
|
67 |
matrix = self.compute_matrix(embeddings)
|
68 |
mean = np.mean(matrix)
|
@@ -71,13 +89,14 @@ class TWCClustering:
|
|
71 |
inc = 0
|
72 |
value = mean
|
73 |
while (value < 1):
|
74 |
-
zscores.append(round(value,2))
|
75 |
inc += 1
|
76 |
value = mean + inc*std
|
77 |
-
print("In clustering:",round(std,2),zscores)
|
78 |
cluster_dict = {}
|
79 |
cluster_dict["clusters"] = []
|
80 |
picked_dict = {}
|
|
|
81 |
|
82 |
for i in range(len(embeddings)):
|
83 |
if (i in picked_dict):
|
@@ -86,8 +105,13 @@ class TWCClustering:
|
|
86 |
arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
|
87 |
cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
|
88 |
self.update_picked_dict(picked_dict,cluster_info["neighs"])
|
|
|
89 |
cluster_dict["clusters"].append(cluster_info)
|
90 |
-
|
|
|
|
|
|
|
|
|
91 |
return cluster_dict
|
92 |
|
93 |
|
|
|
14 |
print("In Zscore Clustering")
|
15 |
|
16 |
def compute_matrix(self,embeddings):
|
17 |
+
#print("Computing similarity matrix ...)")
|
18 |
embeddings= np.array(embeddings)
|
19 |
start = time.time()
|
20 |
vec_a = embeddings.T #vec_a shape (1024,)
|
|
|
23 |
similarity_matrix = np.inner(vec_a,vec_a)
|
24 |
end = time.time()
|
25 |
time_val = (end-start)*1000
|
26 |
+
#print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
|
27 |
return similarity_matrix
|
28 |
|
29 |
def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
|
|
|
63 |
return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
|
64 |
|
65 |
|
66 |
+
def update_overlap_stats(self, overlap_dict, cluster_info):
    """Tally one more appearance for every member of a cluster.

    Each key of ``cluster_info["neighs"]`` has its counter in
    ``overlap_dict`` incremented by one; keys seen for the first time
    start at 1. Called once per cluster, ``overlap_dict`` therefore ends
    up mapping each key to the number of clusters it appeared in.

    Args:
        overlap_dict: Mutable mapping of key -> appearance count. It is
            updated in place.
        cluster_info: Mapping with a ``"neighs"`` entry whose keys are
            the cluster's members (presumably sentence indices; confirm
            against ``find_pivot_subgraph``).

    Returns:
        None. The result is the mutation of ``overlap_dict``.
    """
    # Iterating the mapping yields its keys directly, so no intermediate
    # list of keys is needed.
    for member in cluster_info["neighs"]:
        overlap_dict[member] = overlap_dict.get(member, 0) + 1
|
73 |
+
|
74 |
+
def bucket_overlap(self, overlap_dict):
    """Build a histogram of the counts stored in ``overlap_dict``.

    For every distinct value in ``overlap_dict`` (an appearance count),
    record how many keys carry that value.

    Args:
        overlap_dict: Mapping of key -> appearance count.

    Returns:
        An ``OrderedDict`` mapping count -> number of keys with that
        count, ordered by that number ascending. Ties keep the order in
        which the counts were first seen, because ``sorted`` is stable.
        An empty input yields an empty ``OrderedDict``.
    """
    bucket_dict = {}
    # Only the counts matter here; the keys of overlap_dict are not used.
    for count in overlap_dict.values():
        bucket_dict[count] = bucket_dict.get(count, 0) + 1
    # Sort on the frequency (second tuple element), smallest first.
    return OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1]))
|
83 |
+
|
84 |
def cluster(self,output_file,texts,embeddings,threshold = 1.5):
|
85 |
matrix = self.compute_matrix(embeddings)
|
86 |
mean = np.mean(matrix)
|
|
|
89 |
inc = 0
|
90 |
value = mean
|
91 |
while (value < 1):
|
92 |
+
zscores.append({"threshold":inc,"cosine":round(value,2)})
|
93 |
inc += 1
|
94 |
value = mean + inc*std
|
95 |
+
#print("In clustering:",round(std,2),zscores)
|
96 |
cluster_dict = {}
|
97 |
cluster_dict["clusters"] = []
|
98 |
picked_dict = {}
|
99 |
+
overlap_dict = {}
|
100 |
|
101 |
for i in range(len(embeddings)):
|
102 |
if (i in picked_dict):
|
|
|
105 |
arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
|
106 |
cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
|
107 |
self.update_picked_dict(picked_dict,cluster_info["neighs"])
|
108 |
+
self.update_overlap_stats(overlap_dict,cluster_info)
|
109 |
cluster_dict["clusters"].append(cluster_info)
|
110 |
+
curr_threshold = f"{threshold} (cosine:{mean+threshold*std:.2f})"
|
111 |
+
sorted_d = OrderedDict(sorted(overlap_dict.items(), key=lambda kv: kv[1], reverse=True))
|
112 |
+
#print(sorted_d)
|
113 |
+
sorted_d = self.bucket_overlap(overlap_dict)
|
114 |
+
cluster_dict["info"] ={"mean":mean,"std":std,"current_threshold":curr_threshold,"zscores":zscores,"overlap":list(sorted_d.items())}
|
115 |
return cluster_dict
|
116 |
|
117 |
|