jeff-Tianfeng committed on
Commit 418f8e3 · 1 Parent(s): 0614170

init_project

Files changed (1)
  1. MinRAG.py +0 -234
MinRAG.py DELETED
@@ -1,234 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List
import pandas as pd
from tqdm import tqdm

class KnowledgePointEntropyAnalyzer:
    """
    Analyze the entropy of knowledge points in a binary message matrix.
    """

    def __init__(self, alpha: float = 1e-6):
        """
        Args:
            alpha: Laplace smoothing factor to avoid zero probabilities
        """
        self.alpha = alpha

    def add_background(self, B: np.ndarray) -> np.ndarray:
        """
        Add small background noise to avoid zero probabilities.

        Args:
            B: N x M binary matrix (messages x knowledge points)

        Returns:
            B_prime: smoothed matrix
        """
        n, M = B.shape
        background = self.alpha / (n * M)
        B_prime = B + background
        return B_prime

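    # Worked example for add_background (illustrative numbers, not from the
    # experiment): with alpha = 1e-6 and a 100 x 50 matrix, every entry is
    # shifted by 1e-6 / (100 * 50) = 2e-10, so zero counts become strictly
    # positive while the observed 1s keep essentially all of their weight.
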
    def normalize_to_probability(self, B_prime: np.ndarray) -> np.ndarray:
        """
        Normalize the matrix to a probability distribution.
        """
        S = np.sum(B_prime)
        P = B_prime / S
        return P

    def calculate_type2_entropy(self, P: np.ndarray) -> float:
        """
        Calculate Shannon entropy of the flattened probability distribution.

        Args:
            P: Probability matrix

        Returns:
            H_element: Shannon entropy value
        """
        P_flat = P.flatten()
        P_nonzero = P_flat[P_flat > 0]  # avoid log(0)
        H_element = -np.sum(P_nonzero * np.log2(P_nonzero))
        return H_element

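    # Sanity check for calculate_type2_entropy: a uniform distribution over k
    # cells gives the maximum H = log2(k). For example, a 2 x 2 matrix with all
    # probabilities equal to 0.25 yields -4 * 0.25 * log2(0.25) = 2 bits; any
    # non-uniform P over the same 4 cells yields a strictly smaller value.
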
    def analyze(self, B: np.ndarray) -> Dict:
        """
        Analyze the entropy for a given sample matrix.

        Args:
            B: binary matrix of shape N x M

        Returns:
            Dictionary containing processed matrices and entropy values
        """
        B_prime = self.add_background(B)
        P = self.normalize_to_probability(B_prime)
        H_element = self.calculate_type2_entropy(P)

        return {
            'B': B,
            'B_prime': B_prime,
            'P': P,
            'type2': H_element,
            'n_messages': B.shape[0],
            'n_knowledge_points': B.shape[1]
        }

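# Minimal usage sketch for the analyzer on a toy matrix (illustrative only;
# the real input comes from build_matrix/matrix.npy below):
#
#     analyzer = KnowledgePointEntropyAnalyzer(alpha=1e-6)
#     toy = np.array([[1, 0, 1],
#                     [0, 1, 1]])
#     stats = analyzer.analyze(toy)
#     print(stats['type2'])          # Shannon entropy of the smoothed matrix
#     print(stats['n_messages'])     # 2
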
def run_sampling_entropy(matrix: np.ndarray,
                         sample_sizes: List[int],
                         n_trials: int,
                         alpha: float,
                         method: str = "random") -> pd.DataFrame:
    """
    Run entropy experiments under different sampling strategies.

    Args:
        matrix: Original binary matrix (N x M)
        sample_sizes: List of sample sizes
        n_trials: Number of trials per sample size
        alpha: Laplace smoothing factor
        method: "random" or "greedy"

    Returns:
        DataFrame of entropy results
    """
    analyzer = KnowledgePointEntropyAnalyzer(alpha=alpha)
    records = []

    for size in tqdm(sample_sizes, desc=f"{method} sampling"):
        for trial in range(n_trials):
            if method == "random":
                # Random sampling with replacement
                indices = np.random.choice(matrix.shape[0], size=size, replace=True)
                sampled = matrix[indices]
            elif method == "greedy":
                # Greedy sampling prioritizing high-entropy knowledge points
                sampled = greedy_entropy_sampling(matrix, n_select=size)
            else:
                raise ValueError(f"Unsupported sampling method: {method}")

            result = analyzer.analyze(sampled)
            log_n = np.log2(size)
            records.append({
                "method": method,
                "sample_size": size,
                "trial": trial,
                "log_n": log_n,
                "H_element": result['type2'],
                "H_element_norm": result['type2'] / log_n
            })

    return pd.DataFrame(records)

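# Note on the recorded fields: H_element_norm divides the raw entropy by
# log2(sample_size) (stored alongside it as log_n), which is the "unit
# entropy" plotted in the second panel of plot() below.
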
def greedy_entropy_sampling(matrix: np.ndarray, n_select: int) -> np.ndarray:
    """
    Greedy sampling: select message rows that cover high-entropy knowledge points first.

    Args:
        matrix: Original N x M binary knowledge point matrix
        n_select: Number of messages to select

    Returns:
        Submatrix of size n_select x M
    """
    n = matrix.shape[0]
    B = matrix.copy()

    # Step 1: Calculate marginal entropy for each knowledge point
    def binary_entropy(p):
        if p == 0 or p == 1:
            return 0.0
        return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

    p_j = np.mean(B, axis=0)
    H_j = np.array([binary_entropy(p) for p in p_j])
    sorted_col_indices = np.argsort(-H_j)  # sort columns by entropy, descending

    selected_rows = set()

    # Step 2: Walk the columns from highest to lowest entropy, adding the rows
    # that contain each knowledge point until enough rows are selected
    for col in sorted_col_indices:
        rows_with_col = set(np.where(B[:, col] == 1)[0])
        candidate_rows = rows_with_col - selected_rows

        for row in candidate_rows:
            selected_rows.add(row)
            if len(selected_rows) >= n_select:
                break
        if len(selected_rows) >= n_select:
            break

    # Step 3: If there are still not enough rows, fill the remainder randomly
    if len(selected_rows) < n_select:
        remaining = list(set(range(n)) - selected_rows)
        supplement = np.random.choice(remaining, size=n_select - len(selected_rows), replace=False)
        selected_rows.update(supplement)

    return B[sorted(selected_rows)]

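# Reference values for binary_entropy (standard results): H(0.5) = 1 bit is
# the maximum; H(0.1) = -(0.1 * log2(0.1) + 0.9 * log2(0.9)) ≈ 0.469 bits;
# columns that are all 0s or all 1s get H = 0 and are visited last, so the
# greedy pass favors knowledge points that split the corpus most evenly.
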
def plot(df_all: pd.DataFrame):
    """
    Plot the average entropy curves for different sampling methods.
    """
    df_avg = df_all.groupby(['sample_size', 'method']).agg({
        'H_element': 'mean',
        'H_element_norm': 'mean'
    }).reset_index()

    df_avg.to_csv("type2_entropy_averaged.csv", index=False)
    print("✅ Averaged entropy results saved to type2_entropy_averaged.csv")

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot raw Shannon entropy
    sns.lineplot(
        data=df_avg, x="sample_size", y="H_element",
        hue="method", ax=axes[0], linewidth=2
    )
    axes[0].set_title("Shannon Entropy")
    axes[0].set_xlabel("Sample Size")
    axes[0].set_ylabel("Entropy Value")
    axes[0].legend(title="Method")
    axes[0].grid(False)

    # Plot normalized entropy
    sns.lineplot(
        data=df_avg, x="sample_size", y="H_element_norm",
        hue="method", ax=axes[1], linewidth=2
    )
    axes[1].set_title("Shannon Entropy / log2(n)")
    axes[1].set_xlabel("Sample Size")
    axes[1].set_ylabel("Unit entropy")
    axes[1].legend(title="Method")
    axes[1].grid(False)

    plt.tight_layout()
    plt.savefig("type2_entropy_comparison_smooth.png", dpi=300)
    plt.show()
    print("✅ Smoothed type-2 entropy plot saved as type2_entropy_comparison_smooth.png")

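# Note: df_avg has one row per (sample_size, method) pair, so each lineplot
# draws the trial-averaged curve for each method rather than per-trial noise.
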
if __name__ == "__main__":
    path = "build_matrix/matrix.npy"
    matrix = np.load(path)
    sample_sizes = list(range(50, 300, 50))  # Sampling sizes to evaluate
    n_trials = 10                            # Number of repeated trials for each sample size
    alpha = 1e-6                             # Laplace smoothing factor

    # Run random sampling entropy analysis
    df_random = run_sampling_entropy(matrix, sample_sizes, n_trials, alpha, method="random")
    # Run greedy sampling entropy analysis
    df_greedy = run_sampling_entropy(matrix, sample_sizes, n_trials, alpha, method="greedy")

    df_all = pd.concat([df_random, df_greedy], ignore_index=True)
    plot(df_all)
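
If build_matrix/matrix.npy is not available, a random binary matrix can stand in for a quick smoke test. A minimal sketch (the shape, density, and seed below are assumptions, not part of this commit):

import os
import numpy as np

os.makedirs("build_matrix", exist_ok=True)       # create the path the script expects
rng = np.random.default_rng(0)                   # fixed seed for reproducibility
toy = (rng.random((500, 80)) < 0.2).astype(int)  # assumed: 500 messages x 80 knowledge points, ~20% density
np.save("build_matrix/matrix.npy", toy)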