jeff-Tianfeng committed
Commit 0614170 · 1 Parent(s): 3d66cc3

init_project

Files changed (1)
  1. MinRAG.py +234 -0
MinRAG.py ADDED
@@ -0,0 +1,234 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from typing import Dict, List
+ import pandas as pd
+ from tqdm import tqdm
+
+ class KnowledgePointEntropyAnalyzer:
+     """
+     Analyze the entropy of knowledge points in a binary message matrix.
+     """
+
+     def __init__(self, alpha: float = 1e-6):
+         """
+         Args:
+             alpha: Laplace smoothing factor to avoid zero probabilities
+         """
+         self.alpha = alpha
+
+     def add_background(self, B: np.ndarray) -> np.ndarray:
+         """
+         Add a small uniform background term to avoid zero probabilities.
+
+         Args:
+             B: N x M binary matrix (messages x knowledge points)
+
+         Returns:
+             B_prime: smoothed matrix
+         """
+         n, m = B.shape
+         background = self.alpha / (n * m)
+         B_prime = B + background
+         return B_prime
+
+     def normalize_to_probability(self, B_prime: np.ndarray) -> np.ndarray:
+         """
+         Normalize the matrix to a probability distribution (entries sum to 1).
+         """
+         S = np.sum(B_prime)
+         P = B_prime / S
+         return P
+
+     def calculate_type2_entropy(self, P: np.ndarray) -> float:
+         """
+         Calculate the Shannon entropy of the flattened probability distribution.
+
+         Args:
+             P: probability matrix
+
+         Returns:
+             H_element: Shannon entropy value in bits
+         """
+         P_flat = P.flatten()
+         P_nonzero = P_flat[P_flat > 0]  # avoid log(0)
+         H_element = -np.sum(P_nonzero * np.log2(P_nonzero))
+         return H_element
+
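+     # The quantity above is the element-level ("type-2") entropy
+     #
+     #     H = -sum_{i,j} P[i, j] * log2(P[i, j]),
+     #
+     # which for an N x M matrix is maximized at log2(N * M) when all
+     # entries of P are equal.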
+     def analyze(self, B: np.ndarray) -> Dict:
+         """
+         Run the smoothing, normalization, and entropy steps on a sample matrix.
+
+         Args:
+             B: binary matrix of shape N x M
+
+         Returns:
+             Dictionary containing the processed matrices and entropy value
+         """
+         B_prime = self.add_background(B)
+         P = self.normalize_to_probability(B_prime)
+         H_element = self.calculate_type2_entropy(P)
+
+         return {
+             'B': B,
+             'B_prime': B_prime,
+             'P': P,
+             'type2': H_element,
+             'n_messages': B.shape[0],
+             'n_knowledge_points': B.shape[1]
+         }
+
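+ # Usage sketch (illustrative only; the 2 x 3 matrix below is made up):
+ #
+ #     analyzer = KnowledgePointEntropyAnalyzer(alpha=1e-6)
+ #     B = np.array([[1, 0, 1],
+ #                   [0, 1, 1]])
+ #     result = analyzer.analyze(B)
+ #     result['type2']         # Shannon entropy of the smoothed distribution
+ #     result['n_messages']    # 2
+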
+ def run_sampling_entropy(matrix: np.ndarray,
+                          sample_sizes: List[int],
+                          n_trials: int,
+                          alpha: float,
+                          method: str = "random") -> pd.DataFrame:
+     """
+     Run entropy experiments under different sampling strategies.
+
+     Args:
+         matrix: original binary matrix (N x M)
+         sample_sizes: list of sample sizes to evaluate
+         n_trials: number of trials per sample size
+         alpha: Laplace smoothing factor
+         method: "random" or "greedy"
+
+     Returns:
+         DataFrame of entropy results, one row per (sample size, trial)
+     """
+     analyzer = KnowledgePointEntropyAnalyzer(alpha=alpha)
+     records = []
+
+     for size in tqdm(sample_sizes, desc=f"{method} sampling"):
+         for trial in range(n_trials):
+             if method == "random":
+                 # Random sampling with replacement
+                 indices = np.random.choice(matrix.shape[0], size=size, replace=True)
+                 sampled = matrix[indices]
+             elif method == "greedy":
+                 # Greedy sampling prioritizing high-entropy knowledge points
+                 sampled = greedy_entropy_sampling(matrix, n_select=size)
+             else:
+                 raise ValueError(f"Unsupported sampling method: {method}")
+
+             result = analyzer.analyze(sampled)
+             log_n = np.log2(size)
+             records.append({
+                 "method": method,
+                 "sample_size": size,
+                 "trial": trial,
+                 "log_n": log_n,
+                 "H_element": result['type2'],
+                 "H_element_norm": result['type2'] / log_n
+             })
+
+     return pd.DataFrame(records)
+
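+ # Example call (sketch; "toy" is a synthetic stand-in, not the project matrix):
+ #
+ #     toy = np.random.randint(0, 2, size=(500, 40))
+ #     df = run_sampling_entropy(toy, sample_sizes=[50, 100], n_trials=3,
+ #                               alpha=1e-6, method="random")
+ #     # df columns: method, sample_size, trial, log_n, H_element, H_element_norm
+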
+ def greedy_entropy_sampling(matrix: np.ndarray, n_select: int) -> np.ndarray:
+     """
+     Greedy sampling: select message rows that cover high-entropy knowledge points first.
+
+     Args:
+         matrix: original N x M binary knowledge point matrix
+         n_select: number of messages to select
+
+     Returns:
+         Submatrix of size n_select x M
+     """
+     n, m = matrix.shape
+     B = matrix.copy()
+
+     # Step 1: Calculate the marginal (binary) entropy of each knowledge point
+     def binary_entropy(p):
+         if p == 0 or p == 1:
+             return 0.0
+         return -p * np.log2(p) - (1 - p) * np.log2(1 - p)
+
+     p_j = np.mean(B, axis=0)
+     H_j = np.array([binary_entropy(p) for p in p_j])
+     sorted_col_indices = np.argsort(-H_j)  # sort columns by entropy, descending
+
+     selected_rows = set()
+
+     for col in sorted_col_indices:
+         # Step 2: Select rows containing this knowledge point until the quota is met
+         rows_with_col = set(np.where(B[:, col] == 1)[0])
+         candidate_rows = rows_with_col - selected_rows
+
+         for row in candidate_rows:
+             selected_rows.add(row)
+             if len(selected_rows) >= n_select:
+                 break
+         if len(selected_rows) >= n_select:
+             break
+
+     # Step 3: If not enough rows were selected, fill the remainder randomly
+     if len(selected_rows) < n_select:
+         remaining = list(set(range(n)) - selected_rows)
+         supplement = np.random.choice(remaining, size=n_select - len(selected_rows), replace=False)
+         selected_rows.update(supplement)
+
+     selected_rows = sorted(selected_rows)
+     return B[selected_rows]
+
+
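+ # Intuition check (hypothetical example): if the column means are
+ # p = [0.5, 0.75], the binary entropies are H = [1.0, ~0.811], so column 0
+ # is processed first and rows containing a 1 in column 0 are selected before
+ # any others. For a fixed matrix the greedy pass itself is deterministic;
+ # only the random fill in Step 3 varies between trials.
+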
+ def plot(df_all: pd.DataFrame):
+     """
+     Plot the average entropy curves for the different sampling methods.
+     """
+     df_avg = df_all.groupby(['sample_size', 'method']).agg({
+         'H_element': 'mean',
+         'H_element_norm': 'mean'
+     }).reset_index()
+
+     df_avg.to_csv("type2_entropy_averaged.csv", index=False)
+     print("✅ Averaged entropy results saved to type2_entropy_averaged.csv")
+
+     fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+     # Left panel: raw Shannon entropy
+     sns.lineplot(
+         data=df_avg, x="sample_size", y="H_element",
+         hue="method", ax=axes[0], linewidth=2
+     )
+     axes[0].set_title("Shannon Entropy")
+     axes[0].set_xlabel("Sample Size")
+     axes[0].set_ylabel("Entropy Value")
+     axes[0].legend(title="Method")
+     axes[0].grid(False)
+
+     # Right panel: entropy normalized by log2(n)
+     sns.lineplot(
+         data=df_avg, x="sample_size", y="H_element_norm",
+         hue="method", ax=axes[1], linewidth=2
+     )
+     axes[1].set_title("Shannon Entropy / log2(n)")
+     axes[1].set_xlabel("Sample Size")
+     axes[1].set_ylabel("Normalized Entropy")
+     axes[1].legend(title="Method")
+     axes[1].grid(False)
+
+     plt.tight_layout()
+     plt.savefig("type2_entropy_comparison_smooth.png", dpi=300)
+     plt.show()
+     print("✅ Type-2 entropy comparison plot saved as type2_entropy_comparison_smooth.png")
+
+ if __name__ == "__main__":
+     path = "build_matrix/matrix.npy"
+     matrix = np.load(path)
+     sample_sizes = list(range(50, 300, 50))  # sampling sizes to evaluate: 50, 100, 150, 200, 250
+     n_trials = 10   # number of repeated trials per sample size
+     alpha = 1e-6    # Laplace smoothing factor
+
+     # Run the entropy analysis under random sampling
+     df_random = run_sampling_entropy(matrix, sample_sizes, n_trials, alpha, method="random")
+     # Run the entropy analysis under greedy sampling
+     df_greedy = run_sampling_entropy(matrix, sample_sizes, n_trials, alpha, method="greedy")
+
+     df_all = pd.concat([df_random, df_greedy], ignore_index=True)
+     plot(df_all)
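+
+ # Smoke-test note: if build_matrix/matrix.npy is not available, replacing the
+ # np.load(...) line above with a synthetic binary matrix, e.g.
+ #
+ #     matrix = np.random.randint(0, 2, size=(1000, 64))
+ #
+ # exercises the full random-vs-greedy pipeline on toy data.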