taskswithcode committed on
Commit
889fe1c
·
1 Parent(s): aea620e
Files changed (2) hide show
  1. app.py +1 -1
  2. twc_clustering.py +29 -5
app.py CHANGED
@@ -160,7 +160,7 @@ def display_results(orig_sentences,results,response_info,app_mode,model_name):
160
  main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
161
  main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model:&nbsp;<b>{model_name}</b></div>"
162
  score_text = "cosine distance"
163
- main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}.&nbsp;<b>{len(results['clusters'])} clusters</b>.&nbsp;&nbsp;mean:{results['info']['mean']:.2f}&nbsp;std:{results['info']['std']:.2f}&nbsp;threshold hints:{str(results['info']['zscores'])}</div>"
164
  body_sent = []
165
  download_data = {}
166
  for i in range(len(results["clusters"])):
 
160
  main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
161
  main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model:&nbsp;<b>{model_name}</b></div>"
162
  score_text = "cosine distance"
163
+ main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}.&nbsp;<b>{len(results['clusters'])} clusters</b>.&nbsp;&nbsp;mean:{results['info']['mean']:.2f};&nbsp;std:{results['info']['std']:.2f};&nbsp;current threshold:{results['info']['current_threshold']}<br/>Threshold hints:{str(results['info']['zscores'])}<br/>Overlap stats(overlap,freq):{str(results['info']['overlap'])}</div>"
164
  body_sent = []
165
  download_data = {}
166
  for i in range(len(results["clusters"])):
twc_clustering.py CHANGED
@@ -14,7 +14,7 @@ class TWCClustering:
14
  print("In Zscore Clustering")
15
 
16
  def compute_matrix(self,embeddings):
17
- print("Computing similarity matrix ...)")
18
  embeddings= np.array(embeddings)
19
  start = time.time()
20
  vec_a = embeddings.T #vec_a shape (1024,)
@@ -23,7 +23,7 @@ class TWCClustering:
23
  similarity_matrix = np.inner(vec_a,vec_a)
24
  end = time.time()
25
  time_val = (end-start)*1000
26
- print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
27
  return similarity_matrix
28
 
29
  def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
@@ -63,6 +63,24 @@ class TWCClustering:
63
  return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  def cluster(self,output_file,texts,embeddings,threshold = 1.5):
67
  matrix = self.compute_matrix(embeddings)
68
  mean = np.mean(matrix)
@@ -71,13 +89,14 @@ class TWCClustering:
71
  inc = 0
72
  value = mean
73
  while (value < 1):
74
- zscores.append(round(value,2))
75
  inc += 1
76
  value = mean + inc*std
77
- print("In clustering:",round(std,2),zscores)
78
  cluster_dict = {}
79
  cluster_dict["clusters"] = []
80
  picked_dict = {}
 
81
 
82
  for i in range(len(embeddings)):
83
  if (i in picked_dict):
@@ -86,8 +105,13 @@ class TWCClustering:
86
  arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
87
  cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
88
  self.update_picked_dict(picked_dict,cluster_info["neighs"])
 
89
  cluster_dict["clusters"].append(cluster_info)
90
- cluster_dict["info"] ={"mean":mean,"std":std,"zscores":zscores}
 
 
 
 
91
  return cluster_dict
92
 
93
 
 
14
  print("In Zscore Clustering")
15
 
16
  def compute_matrix(self,embeddings):
17
+ #print("Computing similarity matrix ...)")
18
  embeddings= np.array(embeddings)
19
  start = time.time()
20
  vec_a = embeddings.T #vec_a shape (1024,)
 
23
  similarity_matrix = np.inner(vec_a,vec_a)
24
  end = time.time()
25
  time_val = (end-start)*1000
26
+ #print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
27
  return similarity_matrix
28
 
29
  def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
 
63
  return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
64
 
65
 
66
def update_overlap_stats(self, overlap_dict, cluster_info):
    """Tally, in place, how many times each neighbour key has been seen.

    Every key of ``cluster_info["neighs"]`` has its counter in
    ``overlap_dict`` incremented by one (keys not seen before start at 1).
    ``cluster`` calls this once per cluster it builds, so after the loop a
    count greater than 1 means the key was listed in more than one cluster.

    Args:
        overlap_dict: Mutable mapping of key -> occurrence count; updated
            in place.
        cluster_info: Mapping with a ``"neighs"`` entry whose keys are the
            members to count.

    Returns:
        None. The result is the mutation of ``overlap_dict``.
    """
    # Iterate the keys view directly; no intermediate list is needed
    # because the mapping is not modified while we walk it.
    for member in cluster_info["neighs"].keys():
        overlap_dict[member] = overlap_dict.get(member, 0) + 1
73
+
74
def bucket_overlap(self, overlap_dict):
    """Build a histogram of the overlap counts held in ``overlap_dict``.

    Each value in ``overlap_dict`` (an occurrence count) becomes a bucket
    key, and the bucket value is the number of entries that share that
    count.

    Args:
        overlap_dict: Mapping of key -> occurrence count. The counts must
            be hashable because they are used as bucket keys.

    Returns:
        An ``OrderedDict`` mapping ``overlap_count -> frequency``, ordered
        by ascending frequency. Buckets with equal frequency keep the order
        in which their count first appeared in ``overlap_dict``.
    """
    # Local import: the file-level import block is not visible here, so
    # bring Counter into scope without relying on it.
    from collections import Counter

    # Counter preserves first-seen insertion order, and sorted() is stable,
    # so ties come out in the same order as a hand-written tally would give.
    bucket_counts = Counter(overlap_dict.values())
    return OrderedDict(sorted(bucket_counts.items(), key=lambda kv: kv[1]))
83
+
84
  def cluster(self,output_file,texts,embeddings,threshold = 1.5):
85
  matrix = self.compute_matrix(embeddings)
86
  mean = np.mean(matrix)
 
89
  inc = 0
90
  value = mean
91
  while (value < 1):
92
+ zscores.append({"threshold":inc,"cosine":round(value,2)})
93
  inc += 1
94
  value = mean + inc*std
95
+ #print("In clustering:",round(std,2),zscores)
96
  cluster_dict = {}
97
  cluster_dict["clusters"] = []
98
  picked_dict = {}
99
+ overlap_dict = {}
100
 
101
  for i in range(len(embeddings)):
102
  if (i in picked_dict):
 
105
  arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
106
  cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
107
  self.update_picked_dict(picked_dict,cluster_info["neighs"])
108
+ self.update_overlap_stats(overlap_dict,cluster_info)
109
  cluster_dict["clusters"].append(cluster_info)
110
+ curr_threshold = f"{threshold} (cosine:{mean+threshold*std:.2f})"
111
+ sorted_d = OrderedDict(sorted(overlap_dict.items(), key=lambda kv: kv[1], reverse=True))
112
+ #print(sorted_d)
113
+ sorted_d = self.bucket_overlap(overlap_dict)
114
+ cluster_dict["info"] ={"mean":mean,"std":std,"current_threshold":curr_threshold,"zscores":zscores,"overlap":list(sorted_d.items())}
115
  return cluster_dict
116
 
117