kebeliu committed on
Commit fae3a33 · verified · 1 Parent(s): ff8489f

Create app.py

Files changed (1)
  1. app.py +331 -0
app.py ADDED
@@ -0,0 +1,331 @@
+ import gradio as gr
+ import requests
+ import json
+ from transformers import AutoConfig
+ import math
+ from typing import Dict, Tuple, Optional
+ 
+ 
+ class LLMMemoryCalculator:
+     def __init__(self):
+         self.precision_bytes = {
+             'fp32': 4,
+             'fp16': 2,
+             'bf16': 2,
+             'int8': 1,
+             'int4': 0.5
+         }
+ 
+     # -------------------------------------------------
+     # 📥 Basic utilities
+     # -------------------------------------------------
+     def get_model_config(self, model_id: str) -> Dict:
+         """Fetch the model configuration."""
+         try:
+             config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+             return config
+         except Exception as e:
+             raise Exception(f"Could not fetch model config: {str(e)}")
+ 
+     def get_file_size_from_url(self, model_id: str, filename: str) -> int:
+         """Get a file's size via a HEAD request (fallback path)."""
+         try:
+             url = f"https://huggingface.co/{model_id}/resolve/main/{filename}"
+             response = requests.head(url, timeout=10)
+             if response.status_code == 200:
+                 content_length = response.headers.get('Content-Length')
+                 if content_length:
+                     return int(content_length)
+             return 0
+         except Exception:
+             return 0
+ 
+     # -------------------------------------------------
+     # 📦 Fetch the model weight size
+     # -------------------------------------------------
+     def get_model_size_from_hf(self, model_id: str) -> Tuple[float, str]:
+         """Prefer metadata.total_size from *.index.json; fall back to the file list / HEAD requests."""
+         try:
+             # 1️⃣ Try the index.json files (safetensors before pytorch)
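+             # Illustrative (abridged) shape of such an index file — sharded checkpoints list
+             # the summed weight size under metadata.total_size:
+             #   {"metadata": {"total_size": <bytes>}, "weight_map": {"<tensor name>": "<shard file>", ...}}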
+             for index_name, tag in [
+                 ("model.safetensors.index.json", "safetensors_index"),
+                 ("pytorch_model.bin.index.json", "pytorch_index")
+             ]:
+                 url = f"https://huggingface.co/{model_id}/resolve/main/{index_name}"
+                 resp = requests.get(url, timeout=10)
+                 if resp.status_code == 200:
+                     try:
+                         data = resp.json()
+                     except ValueError:
+                         # Some repos store index.json as plain text; parse it manually
+                         data = json.loads(resp.text)
+                     total_bytes = data.get("metadata", {}).get("total_size", 0)
+                     if total_bytes > 0:
+                         return total_bytes / (1024 ** 3), tag
+ 
+             # 2️⃣ Call the Hub API and try to read the size fields directly
+             api_url = f"https://huggingface.co/api/models/{model_id}"
+             response = requests.get(api_url, timeout=10)
+             if response.status_code != 200:
+                 raise Exception(f"API request failed: {response.status_code}")
+             model_info = response.json()
+ 
+             # 2a. Look for .safetensors entries in `siblings` that report a size
+             safetensors_files = [f for f in model_info.get('siblings', [])
+                                  if f['rfilename'].endswith('.safetensors') and 'size' in f]
+             if safetensors_files:
+                 total_size = sum(f['size'] for f in safetensors_files)
+                 return total_size / (1024 ** 3), "safetensors_files"
+ 
+             # 2b. Fall back to HEAD requests when the .safetensors entries carry no size
+             safetensors_no_size = [f for f in model_info.get('siblings', [])
+                                    if f['rfilename'].endswith('.safetensors')]
+             if safetensors_no_size:
+                 total_size = 0
+                 for f in safetensors_no_size:
+                     total_size += self.get_file_size_from_url(model_id, f['rfilename'])
+                 if total_size > 0:
+                     return total_size / (1024 ** 3), "safetensors_head"
+ 
+             # 2c. Same handling for pytorch_model-xxxxx.bin shards
+             pytorch_files = [f for f in model_info.get('siblings', [])
+                              if f['rfilename'].endswith('.bin') and 'size' in f]
+             if pytorch_files:
+                 total_size = sum(f['size'] for f in pytorch_files)
+                 return total_size / (1024 ** 3), "pytorch_files"
+ 
+             pytorch_no_size = [f for f in model_info.get('siblings', [])
+                                if f['rfilename'].endswith('.bin')]
+             if pytorch_no_size:
+                 total_size = 0
+                 for f in pytorch_no_size:
+                     total_size += self.get_file_size_from_url(model_id, f['rfilename'])
+                 if total_size > 0:
+                     return total_size / (1024 ** 3), "pytorch_head"
+ 
+             # 3️⃣ If the size still cannot be determined, fall back to estimation
+             raise Exception("No weight size information found")
+ 
+         except Exception:
+             # Estimate from the config instead
+             return self.estimate_model_size_from_config(model_id)
+ 
+     # -------------------------------------------------
+     # 📐 Estimation logic (kept consistent with the original)
+     # -------------------------------------------------
+     def estimate_model_size_from_config(self, model_id: str) -> Tuple[float, str]:
+         """Estimate the model size (FP16) from config.json."""
+         try:
+             config = self.get_model_config(model_id)
+ 
+             vocab_size = getattr(config, 'vocab_size', 50000)
+             hidden_size = getattr(config, 'hidden_size', getattr(config, 'd_model', 4096))
+             num_layers = getattr(config, 'num_hidden_layers', getattr(config, 'num_layers', 32))
+             intermediate_size = getattr(config, 'intermediate_size', hidden_size * 4)
+ 
+             # Embedding
+             embedding_params = vocab_size * hidden_size
+ 
+             # Transformer layer
+             attention_params = 4 * hidden_size * hidden_size
+             ffn_params = 2 * hidden_size * intermediate_size
+             ln_params = 2 * hidden_size
+             params_per_layer = attention_params + ffn_params + ln_params
+ 
+             total_params = embedding_params + num_layers * params_per_layer
+             if hasattr(config, 'tie_word_embeddings') and not config.tie_word_embeddings:
+                 total_params += vocab_size * hidden_size
+ 
+             model_size_gb = (total_params * 2) / (1024 ** 3)  # fp16 by default
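+             # Worked example (illustrative, Llama-2-7B-like shapes): vocab 32000, hidden 4096,
+             # 32 layers, intermediate 11008 → 32000*4096 + 32*(4*4096^2 + 2*4096*11008 + 2*4096)
+             # ≈ 5.16e9 params ≈ 9.6 GB at fp16. Gated MLPs actually use three projections,
+             # so this two-matrix FFN term tends to underestimate such models.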
+             return model_size_gb, "estimated"
+ 
+         except Exception as e:
+             raise Exception(f"Could not estimate model size: {str(e)}")
+ 
+     # -------------------------------------------------
+     # 🗄️ KV cache calculation (original logic kept)
+     # -------------------------------------------------
+     def calculate_kv_cache_size(self, config, context_length: int, batch_size: int = 1) -> Dict[str, float]:
+         try:
+             num_layers = getattr(config, 'num_hidden_layers', getattr(config, 'num_layers', 32))
+             hidden_size = getattr(config, 'hidden_size', getattr(config, 'd_model', 4096))
+             num_attention_heads = getattr(config, 'num_attention_heads', getattr(config, 'num_heads', 32))
+             num_key_value_heads = getattr(config, 'num_key_value_heads', num_attention_heads)
+             is_mla = hasattr(config, 'kv_lora_rank') and config.kv_lora_rank is not None
+             head_dim = hidden_size // num_attention_heads
+ 
+             if is_mla:
+                 kv_lora_rank = getattr(config, 'kv_lora_rank', 512)
+                 kv_cache_per_token = kv_lora_rank * 2
+                 attention_type = "MLA"
+             elif num_key_value_heads < num_attention_heads:
+                 kv_cache_per_token = num_key_value_heads * head_dim * 2
+                 attention_type = "GQA"
+             else:
+                 kv_cache_per_token = num_attention_heads * head_dim * 2
+                 attention_type = "MHA"
+ 
+             total_kv_cache = (kv_cache_per_token * context_length * num_layers * batch_size * 2) / (1024 ** 3)
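+             # Worked example (illustrative): MHA with 32 layers, 32 heads, head_dim 128,
+             # context 16384, batch 1 → per-token K+V = 32 * 128 * 2 = 8192 elements;
+             # 8192 * 16384 * 32 * 1 * 2 bytes (fp16) = 8 GiB of KV cache.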
+             return {
+                 'size_gb': total_kv_cache,
+                 'attention_type': attention_type,
+                 'num_kv_heads': num_key_value_heads,
+                 'num_attention_heads': num_attention_heads,
+                 'head_dim': head_dim
+             }
+         except Exception as e:
+             raise Exception(f"KV cache calculation failed: {str(e)}")
+ 
+     # -------------------------------------------------
+     # 🧮 Combined memory requirement calculation (unchanged)
+     # -------------------------------------------------
+     def calculate_memory_requirements(self, model_id: str, gpu_memory_gb: float, num_gpus: int,
+                                       context_length: int, utilization_rate: float = 0.9) -> Dict:
+         try:
+             config = self.get_model_config(model_id)
+             model_size_gb, size_source = self.get_model_size_from_hf(model_id)
+             kv_info = self.calculate_kv_cache_size(config, context_length)
+ 
+             available_memory = gpu_memory_gb * num_gpus * utilization_rate
+             other_overhead = model_size_gb * 0.1
+             total_memory_needed = model_size_gb + kv_info['size_gb'] + other_overhead
+ 
+             is_feasible = total_memory_needed <= available_memory
+             memory_margin = available_memory - total_memory_needed
+             memory_per_gpu = total_memory_needed / num_gpus
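+             # Worked example (illustrative): a 13.5 GB fp16 model with an 8 GB KV cache and
+             # 10% overhead (1.35 GB) needs ≈ 22.85 GB; one 24 GB GPU at 90% utilization
+             # offers 21.6 GB, so is_feasible would be False by ≈ 1.25 GB.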
+ 
+             return {
+                 'model_id': model_id,
+                 'model_size_gb': round(model_size_gb, 2),
+                 'size_source': size_source,
+                 'kv_cache_gb': round(kv_info['size_gb'], 2),
+                 'attention_type': kv_info['attention_type'],
+                 'other_overhead_gb': round(other_overhead, 2),
+                 'total_memory_needed_gb': round(total_memory_needed, 2),
+                 'available_memory_gb': round(available_memory, 2),
+                 'memory_margin_gb': round(memory_margin, 2),
+                 'memory_per_gpu_gb': round(memory_per_gpu, 2),
+                 'is_feasible': is_feasible,
+                 'utilization_per_gpu': round((memory_per_gpu / gpu_memory_gb) * 100, 1),
+                 'config_info': {
+                     'num_layers': getattr(config, 'num_hidden_layers', getattr(config, 'num_layers', 'N/A')),
+                     'hidden_size': getattr(config, 'hidden_size', getattr(config, 'd_model', 'N/A')),
+                     'num_attention_heads': kv_info['num_attention_heads'],
+                     'num_kv_heads': kv_info['num_kv_heads'],
+                     'head_dim': kv_info['head_dim']
+                 }
+             }
+         except Exception as e:
+             return {'error': str(e)}
+ 
+ 
+ # -------------------------------------------------
+ # 🌟 Gradio interface construction (original logic kept)
+ # -------------------------------------------------
+ 
+ def create_gradio_interface():
+     calculator = LLMMemoryCalculator()
+ 
+     def calculate_memory(model_id, gpu_memory, num_gpus, context_length, utilization_rate):
+         if not model_id.strip():
+             return "Please enter a model ID"
+ 
+         try:
+             result = calculator.calculate_memory_requirements(
+                 model_id.strip(),
+                 float(gpu_memory),
+                 int(num_gpus),
+                 int(context_length),
+                 float(utilization_rate) / 100
+             )
+ 
+             if 'error' in result:
+                 return f"❌ Error: {result['error']}"
+ 
+             status = "✅ Can run" if result['is_feasible'] else "❌ Not enough GPU memory"
+ 
+             output = f"""
+ ## Model Analysis
+ 
+ **Model**: {result['model_id']}
+ **Status**: {status}
+ 
+ ### 📊 Memory Analysis
+ - **Model size**: {result['model_size_gb']} GB ({result['size_source']})
+ - **KV cache**: {result['kv_cache_gb']} GB
+ - **Other overhead**: {result['other_overhead_gb']} GB
+ - **Total required**: {result['total_memory_needed_gb']} GB
+ - **Available GPU memory**: {result['available_memory_gb']} GB
+ - **Remaining GPU memory**: {result['memory_margin_gb']} GB
+ 
+ ### 🔧 Model Configuration
+ - **Attention type**: {result['attention_type']}
+ - **Layers**: {result['config_info']['num_layers']}
+ - **Hidden size**: {result['config_info']['hidden_size']}
+ - **Attention heads**: {result['config_info']['num_attention_heads']}
+ - **KV heads**: {result['config_info']['num_kv_heads']}
+ - **Head dim**: {result['config_info']['head_dim']}
+ 
+ ### 💾 GPU Usage
+ - **Memory per GPU**: {result['memory_per_gpu_gb']} GB
+ - **Utilization per GPU**: {result['utilization_per_gpu']}%
+ 
+ ### 💡 Recommendation
+ """
+             if result['is_feasible']:
+                 output += f"✅ The current configuration can run this model, with {result['memory_margin_gb']} GB of GPU memory to spare."
+             else:
+                 needed_extra = abs(result['memory_margin_gb'])
+                 output += f"❌ An additional {needed_extra} GB of GPU memory is needed.\nSuggestions:\n- Add more GPUs\n- Use GPUs with more memory\n- Reduce the context length\n- Quantize the model (e.g. int8/int4)"
+ 
+             return output
+         except Exception as e:
+             return f"❌ Calculation failed: {str(e)}"
+ 
+     with gr.Blocks(title="LLM GPU Memory Calculator", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🚀 LLM GPU Memory Requirement Calculator")
+         gr.Markdown("Enter a model and your hardware configuration to check whether the LLM can run")
+ 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("## 📝 Input Parameters")
+ 
+                 model_id = gr.Textbox(label="🤗 Hugging Face model ID",
+                                       placeholder="e.g. deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
+                                       value="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B")
+ 
+                 with gr.Row():
+                     gpu_memory = gr.Number(label="💾 Memory per GPU (GB)", value=24, minimum=1, maximum=1000)
+                     num_gpus = gr.Number(label="🔢 Number of GPUs", value=1, minimum=1, maximum=64, precision=0)
+ 
+                 with gr.Row():
+                     context_length = gr.Number(label="📏 Context length", value=16384, minimum=512, maximum=1000000, precision=0)
+                     utilization_rate = gr.Slider(label="⚡ GPU memory utilization (%)", minimum=50, maximum=95, value=90, step=5)
+ 
+                 calculate_btn = gr.Button("🔍 Calculate memory requirements", variant="primary")
+ 
+             with gr.Column(scale=2):
+                 gr.Markdown("## 📊 Results")
+                 output = gr.Markdown("Click the button to start the analysis...")
+ 
+         calculate_btn.click(fn=calculate_memory,
+                             inputs=[model_id, gpu_memory, num_gpus, context_length, utilization_rate],
+                             outputs=output)
+ 
+         gr.Markdown("""
+ ## 📚 Example Models
+ 
+ **Small**: `microsoft/DialoGPT-medium`
+ **Medium**: `microsoft/DialoGPT-large`
+ **Large**: `meta-llama/Llama-2-7b-hf`
+ **Extra large**: `meta-llama/Llama-2-13b-hf`
+ 
+ Note: some models may require requesting access.
+ """)
+ 
+     return demo
+ 
+ 
+ if __name__ == "__main__":
+     demo = create_gradio_interface()
+     demo.launch(share=True, debug=True)
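
For reference, the committed calculator can also be exercised without the Gradio UI. A minimal sketch, assuming `app.py` is importable from the working directory and the machine has network access to the Hugging Face Hub (gated models additionally need authentication):

```python
# Hypothetical standalone usage of the committed module; not part of app.py itself.
from app import LLMMemoryCalculator

calc = LLMMemoryCalculator()
result = calc.calculate_memory_requirements(
    model_id="meta-llama/Llama-2-7b-hf",  # any Hub model ID
    gpu_memory_gb=24,
    num_gpus=1,
    context_length=16384,
    utilization_rate=0.9,
)
if "error" in result:
    print("Lookup failed:", result["error"])
else:
    print("Feasible:", result["is_feasible"])
    print("Total memory needed (GB):", result["total_memory_needed_gb"])
```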