upload code0304

Browse files

Files changed (11) hide show

.gitattributes +1 -0
.ipynb_checkpoints/eval-checkpoint.ipynb +395 -0
.ipynb_checkpoints/graph_train-checkpoint.ipynb +1591 -0
.ipynb_checkpoints/graph_train2-checkpoint.ipynb +1674 -0
.ipynb_checkpoints/graph_train3-checkpoint.ipynb +1588 -0
eval.ipynb +406 -0
final_Graph.json +3 -0
graph_train.ipynb +1591 -0
graph_train2.ipynb +1506 -0
graph_train3.ipynb +1588 -0
train_data.pt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+final_Graph.json filter=lfs diff=lfs merge=lfs -text

.ipynb_checkpoints/eval-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,395 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "import torch\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "\n",
+    "MODEL_NAME = \"/workspace/model\"\n",
+    "model_token = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_token)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "test_data = data[0]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\") \n",
+    "\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'What are the signal definitions in the Verilog code for the calculator module, and what are their purposes?'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "question1 = conversations[0][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "question1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "from torch.utils.data import Dataset\n",
+    "from transformers import AutoModelForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM wrapper that prepends one projected graph-embedding token to the input.\n",
+    "\n",
+    "    This is a plain `nn.Module` around the loaded causal LM rather than a subclass of\n",
+    "    the `AutoModelForCausalLM` factory: the factory's `from_pretrained` returns the\n",
+    "    concrete architecture class, so a `forward` defined on a factory subclass is never\n",
+    "    the one that runs. The wrapped LM is available as `.model`.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Width of the incoming graph embedding.\n",
+    "    GRAPH_DIM = 512\n",
+    "\n",
+    "    def __init__(self, config, base_model=None):\n",
+    "        \"\"\"Build the wrapper.\n",
+    "\n",
+    "        Args:\n",
+    "            config: model config; `config.hidden_size` sizes the projection output.\n",
+    "            base_model: optional, already-loaded causal LM to wrap. When omitted, a\n",
+    "                randomly initialised LM is created from `config`.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.config = config\n",
+    "        self.model = base_model if base_model is not None else AutoModelForCausalLM.from_config(config)\n",
+    "\n",
+    "        # Linear map from the graph-embedding width to the LM hidden size.\n",
+    "        self.graph_proj = nn.Linear(self.GRAPH_DIM, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the LM on `[graph token] + token embeddings`.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch_size, seq_len) token ids.\n",
+    "            attention_mask: optional (batch_size, seq_len) mask.\n",
+    "            labels: optional (batch_size, seq_len) targets.\n",
+    "            graph_embedding: tensor holding batch_size * 512 values, e.g. (batch_size, 512)\n",
+    "                or (batch_size, 1, 512). When None, the LM runs on the text tokens only.\n",
+    "\n",
+    "        Returns:\n",
+    "            The wrapped model's output. With a graph embedding, the sequence axis of the\n",
+    "            logits has length seq_len + 1 (graph token first).\n",
+    "        \"\"\"\n",
+    "        # Token embeddings: (batch_size, seq_len, hidden_size)\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n",
+    "\n",
+    "        if graph_embedding is not None:\n",
+    "            batch_size = inputs_embeds.shape[0]\n",
+    "\n",
+    "            # reshape instead of squeeze(0): squeeze(0) drops the batch axis of a (1, 512)\n",
+    "            # input and the later concatenation fails. Also move the tensor to the\n",
+    "            # projection's device/dtype so a CPU embedding works with a GPU model.\n",
+    "            graph_embedding = graph_embedding.to(\n",
+    "                device=self.graph_proj.weight.device, dtype=self.graph_proj.weight.dtype\n",
+    "            ).reshape(batch_size, self.GRAPH_DIM)\n",
+    "\n",
+    "            # Projected graph token: (batch_size, 1, hidden_size), cast to the LM's dtype.\n",
+    "            graph_token = self.graph_proj(graph_embedding).unsqueeze(1).to(inputs_embeds.dtype)\n",
+    "            inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)\n",
+    "\n",
+    "            # The graph token is always attended to.\n",
+    "            if attention_mask is not None:\n",
+    "                graph_mask = attention_mask.new_ones((batch_size, 1))\n",
+    "                attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "            # Keep labels aligned with the longer sequence; -100 is the ignore index of\n",
+    "            # the LM loss, so the graph position contributes no loss.\n",
+    "            if labels is not None:\n",
+    "                graph_labels = labels.new_full((batch_size, 1), -100)\n",
+    "                labels = torch.cat([graph_labels, labels], dim=1)\n",
+    "\n",
+    "        return self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Load the base causal LM from a checkpoint and wrap it in `GraphAwareLM`.\"\"\"\n",
+    "        base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)\n",
+    "        # NOTE(review): `graph_proj` is freshly initialised here, as in the original code;\n",
+    "        # trained projection weights, if any exist, must be loaded separately.\n",
+    "        return cls(base_model.config, base_model=base_model)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n"
+     ]
+    }
+   ],
+   "source": [
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME).to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "         -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "         -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "         -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "         -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "          0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "         -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "         -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "         -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "         -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "          0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "          0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "         -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "          1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "         -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "         -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "         -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "         -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "         -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "          0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "          0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "         -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "          1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "          0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "         -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "         -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "         -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "         -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "         -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "          2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "         -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "         -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "         -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "          0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "         -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "         -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "          1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "         -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "         -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "         -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "         -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "         -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "          1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "          0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "         -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "          1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "         -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "          0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "         -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "         -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "          2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "         -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "          0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "         -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "         -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "          0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "          0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "         -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "          0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "         -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "         -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "         -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "         -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "          1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635]],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Load the tokenizer identified by `model_token`.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_token)\n",
+    "\n",
+    "# Tokenize the question, truncated to `max_seq_length - GRAPH_LENGTH` tokens, and move it to `device`.\n",
+    "inputs = tokenizer(\n",
+    "    question1,\n",
+    "    return_tensors=\"pt\",\n",
+    "    truncation=True,\n",
+    "    max_length=max_seq_length - GRAPH_LENGTH,\n",
+    ").to(device)\n",
+    "\n",
+    "# `Tensor.to` returns a new tensor instead of moving the original in place, so rebind the\n",
+    "# name; otherwise later cells keep using the CPU copy.\n",
+    "graph_embedding = graph_embedding.to(device)\n",
+    "graph_embedding\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "RuntimeError",
+     "evalue": "The size of tensor a (23) must match the size of tensor b (22) at non-singleton dimension 3",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[10], line 6\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(max_new_tokens):\n\u001b[1;32m      4\u001b[0m     \u001b[38;5;66;03m# ✅ 计算 logits 并进行生成\u001b[39;00m\n\u001b[1;32m      5\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m----> 6\u001b[0m         outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      7\u001b[0m \u001b[43m            \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerated_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m        \u001b[49m\u001b[38;5;66;43;03m# (batch_size, seq_len)\u001b[39;49;00m\n\u001b[1;32m      8\u001b[0m \u001b[43m            \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mattention_mask\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# (batch_size, seq_len)\u001b[39;49;00m\n\u001b[1;32m      9\u001b[0m \u001b[43m            \u001b[49m\u001b[43mgraph_embedding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_embedding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m      \u001b[49m\u001b[38;5;66;43;03m# (batch_size, 512)\u001b[39;49;00m\n\u001b[1;32m     10\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     13\u001b[0m     logits \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mlogits[:, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, :]  \u001b[38;5;66;03m# 取最后一个 token 的 logits\u001b[39;00m\n\u001b[1;32m     14\u001b[0m     next_token \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39margmax(logits, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, 
keepdim\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)  \u001b[38;5;66;03m# 贪心解码\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/utils/deprecation.py:172\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>.wrapper.<locals>.wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    168\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m minimum_action \u001b[38;5;129;01min\u001b[39;00m (Action\u001b[38;5;241m.\u001b[39mNOTIFY, Action\u001b[38;5;241m.\u001b[39mNOTIFY_ALWAYS) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torchdynamo_compiling():\n\u001b[1;32m    169\u001b[0m     \u001b[38;5;66;03m# DeprecationWarning is ignored by default, so we use FutureWarning instead\u001b[39;00m\n\u001b[1;32m    170\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:856\u001b[0m, in \u001b[0;36mQwen2ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[1;32m    853\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m    855\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m--> 856\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    857\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    858\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    859\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    860\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    861\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    862\u001b[0m \u001b[43m    
\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    863\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    864\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    865\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    866\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    867\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    868\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    870\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    871\u001b[0m \u001b[38;5;66;03m# Only compute necessary logits, and do not upcast them to float if we are not computing the loss\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:579\u001b[0m, in \u001b[0;36mQwen2Model.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **flash_attn_kwargs)\u001b[0m\n\u001b[1;32m    567\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m    568\u001b[0m         decoder_layer\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m    569\u001b[0m         hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    576\u001b[0m         position_embeddings,\n\u001b[1;32m    577\u001b[0m     )\n\u001b[1;32m    578\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 579\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    580\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    581\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcausal_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    582\u001b[0m \u001b[43m        \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    583\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    584\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    585\u001b[0m \u001b[43m        
\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    586\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    587\u001b[0m \u001b[43m        \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    588\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mflash_attn_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    589\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    591\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    593\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:260\u001b[0m, in \u001b[0;36mQwen2DecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)\u001b[0m\n\u001b[1;32m    257\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_layernorm(hidden_states)\n\u001b[1;32m    259\u001b[0m \u001b[38;5;66;03m# Self Attention\u001b[39;00m\n\u001b[0;32m--> 260\u001b[0m hidden_states, self_attn_weights \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself_attn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    261\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    262\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    263\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    264\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    265\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    266\u001b[0m \u001b[43m    \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    267\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    268\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    269\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    270\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    271\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m residual \u001b[38;5;241m+\u001b[39m hidden_states\n\u001b[1;32m    273\u001b[0m \u001b[38;5;66;03m# Fully Connected\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:192\u001b[0m, in \u001b[0;36mQwen2Attention.forward\u001b[0;34m(self, hidden_states, position_embeddings, attention_mask, past_key_value, cache_position, **kwargs)\u001b[0m\n\u001b[1;32m    189\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    190\u001b[0m         attention_interface \u001b[38;5;241m=\u001b[39m ALL_ATTENTION_FUNCTIONS[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39m_attn_implementation]\n\u001b[0;32m--> 192\u001b[0m attn_output, attn_weights \u001b[38;5;241m=\u001b[39m \u001b[43mattention_interface\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    193\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    194\u001b[0m \u001b[43m    \u001b[49m\u001b[43mquery_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    195\u001b[0m \u001b[43m    \u001b[49m\u001b[43mkey_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    196\u001b[0m \u001b[43m    \u001b[49m\u001b[43mvalue_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    197\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    198\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdropout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention_dropout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    199\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mscaling\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscaling\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    200\u001b[0m \u001b[43m    \u001b[49m\u001b[43msliding_window\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msliding_window\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# main diff with Llama\u001b[39;49;00m\n\u001b[1;32m    201\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    202\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    204\u001b[0m attn_output \u001b[38;5;241m=\u001b[39m attn_output\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m*\u001b[39minput_shape, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous()\n\u001b[1;32m    205\u001b[0m attn_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mo_proj(attn_output)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:123\u001b[0m, in \u001b[0;36meager_attention_forward\u001b[0;34m(module, query, key, value, attention_mask, scaling, dropout, **kwargs)\u001b[0m\n\u001b[1;32m    121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attention_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    122\u001b[0m     causal_mask \u001b[38;5;241m=\u001b[39m attention_mask[:, :, :, : key_states\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m]]\n\u001b[0;32m--> 123\u001b[0m     attn_weights \u001b[38;5;241m=\u001b[39m \u001b[43mattn_weights\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcausal_mask\u001b[49m\n\u001b[1;32m    125\u001b[0m attn_weights \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39msoftmax(attn_weights, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\u001b[38;5;241m.\u001b[39mto(query\u001b[38;5;241m.\u001b[39mdtype)\n\u001b[1;32m    126\u001b[0m attn_weights \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mdropout(attn_weights, p\u001b[38;5;241m=\u001b[39mdropout, training\u001b[38;5;241m=\u001b[39mmodule\u001b[38;5;241m.\u001b[39mtraining)\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (23) must match the size of tensor b (22) at non-singleton dimension 3"
+     ]
+    }
+   ],
+   "source": [
+    "# Greedy decoding: re-run the model on the whole sequence each step and\n",
+    "# append the arg-max token until EOS or max_new_tokens is reached.\n",
+    "generated_ids = inputs[\"input_ids\"]          # (batch_size, seq_len)\n",
+    "# The mask has to grow together with generated_ids; passing the original\n",
+    "# prompt mask on every step makes it one column short from step 2 onwards\n",
+    "# (RuntimeError: size of tensor a (23) must match tensor b (22)).\n",
+    "attention_mask = inputs[\"attention_mask\"]    # (batch_size, seq_len)\n",
+    "max_new_tokens = 1024\n",
+    "for _ in range(max_new_tokens):\n",
+    "    # Forward pass without gradient tracking\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,\n",
+    "            attention_mask=attention_mask,\n",
+    "            graph_embedding=graph_embedding,  # (batch_size, 512) per the original note - TODO confirm\n",
+    "        )\n",
+    "\n",
+    "    logits = outputs.logits[:, -1, :]  # logits of the last position only\n",
+    "    next_token = torch.argmax(logits, dim=-1, keepdim=True)  # greedy pick, (batch_size, 1)\n",
+    "\n",
+    "    # Append the new token and a matching column of ones to the mask\n",
+    "    generated_ids = torch.cat([generated_ids, next_token], dim=-1)\n",
+    "    attention_mask = torch.cat(\n",
+    "        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],\n",
+    "        dim=-1,\n",
+    "    )\n",
+    "\n",
+    "    # Stop once every row has just produced EOS (same as before for batch size 1)\n",
+    "    if (next_token[:, 0] == tokenizer.eos_token_id).all():\n",
+    "        break\n",
+    "\n",
+    "# Decode the first sequence of the batch\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: How does the code handle combinational logic? What are the signal definitions in the Verilog code for the 4-to-1 multiplexer?\n",
+      "The code uses assign statements to handle combinational logic. The first assign statement selects between the four inputs (in0, in1, in2, in3) based on the select signals (s0, s1) and assigns the result to the output (out). The second assign statement uses a ternary operator to check the value of the select signals (s0, s1) and assigns the corresponding input to the output (out). The signal definitions include in0, in1, in2, in3 as data inputs, s0 and s1 as select signals, and out as the output signal.\n",
+      "How does the code handle sequential logic? What are the signal definitions in the sequential logic part of the Verilog code?\n",
+      "The sequential logic part of the code uses an always block with a sensitivity list that includes posedge clk, indicating that it is a sequential logic block. The output (out) is updated on the rising edge of the clock signal (clk). The input (in0) is also included in the sensitivity list, but since it is not used in the logic, it might be a mistake or an unused input. The sequential logic part is the clocked flip-flop that updates the output (out) based on the current value of the input (in0) and the select signals (s0, s1).\n",
+      "What is the function of the circuit described in the Verilog code?\n",
+      "The circuit is a 4-to-1 multiplexer with a registered output. It selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop on the rising edge of the clock signal (clk). The output (out) is the value of the selected input stored in the flip-flop.\n",
+      "How can the circuit be implemented in hardware?\n",
+      "The circuit can be implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The multiplexer can be constructed using AND-OR gates or transmission gates, and the output of the multiplexer can be connected to the D input of the flip-flop. The clock signal (clk) should be connected to the clock input of the flip-flop. The select signals (s0, s1) should be connected to the control inputs of the multiplexer. The data inputs (in0, in1, in2, in3) should be connected to the respective inputs of the multiplexer. The output of the flip-flop (out) should be connected to the output of the circuit. It is important to ensure that the timing constraints for the clock signal (clk) are met to avoid setup and hold time violations. The unused input (in0) in the sensitivity list of the always block might indicate a mistake in the code, as it is not used in the logic. However, it could be a typo or an oversight in the code. The implementation should focus on the functional parts of the circuit, which are the multiplexer and the flip-flop. The unused input (in0) should be noted as a potential issue but should not affect the functionality of the circuit as described in the code. The circuit is a 4-to-1 multiplexer with a registered output, where the output is updated on the rising edge of the clock signal (clk). The multiplexer selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop. The circuit is implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The implementation should focus on the functional parts of the circuit, which are the multiplexer and the flip-flop, while noting the potential issue of the unused input (in0) in the sensitivity list of the always block. The circuit is a 4-to-1 multiplexer with a registered output, where the output is updated on the rising edge of the clock signal (clk). 
The multiplexer selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop. The circuit is implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The implementation should focus on the functional parts of the circuit, which are the multiplexer and the flip-flop, while noting the potential issue of the unused input (in0) in the sensitivity list of the always block. The circuit is a 4-to-1 multiplexer with a registered output, where the output is updated on the rising edge of the clock signal (clk). The multiplexer selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop. The circuit is implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The implementation should focus on the functional parts of the circuit\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Greedy decoding: re-run the model on the whole sequence each step and\n",
+    "# append the arg-max token until EOS or max_new_tokens is reached.\n",
+    "generated_ids = inputs[\"input_ids\"]          # (batch_size, seq_len)\n",
+    "# Keep the mask the same length as generated_ids; reusing the prompt-only\n",
+    "# mask leaves it one column short as soon as a token has been appended.\n",
+    "attention_mask = inputs[\"attention_mask\"]    # (batch_size, seq_len)\n",
+    "max_new_tokens = 1024\n",
+    "for _ in range(max_new_tokens):\n",
+    "    # Forward pass without gradient tracking\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,\n",
+    "            attention_mask=attention_mask,\n",
+    "            graph_embedding=graph_embedding,  # (batch_size, 512) per the original note - TODO confirm\n",
+    "        )\n",
+    "\n",
+    "    logits = outputs.logits[:, -1, :]  # logits of the last position only\n",
+    "    next_token = torch.argmax(logits, dim=-1, keepdim=True)  # greedy pick, (batch_size, 1)\n",
+    "\n",
+    "    # Append the new token and a matching column of ones to the mask\n",
+    "    generated_ids = torch.cat([generated_ids, next_token], dim=-1)\n",
+    "    attention_mask = torch.cat(\n",
+    "        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],\n",
+    "        dim=-1,\n",
+    "    )\n",
+    "\n",
+    "    # Stop once every row has just produced EOS (same as before for batch size 1)\n",
+    "    if (next_token[:, 0] == tokenizer.eos_token_id).all():\n",
+    "        break\n",
+    "\n",
+    "# Decode the first sequence of the batch\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

.ipynb_checkpoints/graph_train-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,1591 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fa17529d-eaa7-473e-9d2d-cc05a0120a51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model identifier string (looks like a Hugging Face Hub id; its use is outside this cell).\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "# Repository id for this run - presumably the push target; confirm against the training cell.\n",
+    "HF_NAME = \"KSU-HW-SEC/r1q1.5_graph_lora_new\"\n",
+    "# Graph length budget; a later cell adds it to 1100 to form max_seq_length.\n",
+    "GRAPH_LENGTH = 512\n",
+    "# Marker token for each conversation role key.\n",
+    "ROLE_TOKENS = dict(human=\"<|User|>\", gpt=\"<|Assistant|>\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bba6e6db-4b79-4461-ba13-75fd41019358",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CUDA 可用: True\n",
+      "GPU 数量: 1\n",
+      "当前 GPU: 0\n",
+      "GPU 名称: NVIDIA A100 80GB PCIe\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Environment setup, kept commented out; run once on a fresh machine.\n",
+    "# !pip install transformers accelerate datasets\n",
+    "# !pip install galore-torch\n",
+    "# !pip install huggingface_hub\n",
+    "import torch\n",
+    "\n",
+    "# Each probe is called lazily, in order, right before its line is printed.\n",
+    "cuda_probes = (\n",
+    "    (\"CUDA 可用:\", torch.cuda.is_available),\n",
+    "    (\"GPU 数量:\", torch.cuda.device_count),\n",
+    "    (\"当前 GPU:\", torch.cuda.current_device),\n",
+    "    (\"GPU 名称:\", lambda: torch.cuda.get_device_name(torch.cuda.current_device())),\n",
+    ")\n",
+    "for label, probe in cuda_probes:\n",
+    "    print(label, probe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ef5551ca-89e2-4488-8e68-1c8d964de039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_seq_length = GRAPH_LENGTH + 1100  # maximum sequence length"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8e283f49-fde4-46e2-9891-dbc304058f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:48: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_081255-v0v96nik</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/v0v96nik' target=\"_blank\">experi0304</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/v0v96nik' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/v0v96nik</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5310' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5310/5310 1:23:11, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>5.349900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>5.305900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>4.849500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>3.910800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>3.325600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>3.144900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>2.904200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>2.082100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>1.214300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>1.011600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>550</td>\n",
+       "      <td>0.889300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>600</td>\n",
+       "      <td>0.907300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>650</td>\n",
+       "      <td>1.190400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>700</td>\n",
+       "      <td>1.889100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>750</td>\n",
+       "      <td>4.505600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>800</td>\n",
+       "      <td>6.402800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>850</td>\n",
+       "      <td>6.479300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>900</td>\n",
+       "      <td>7.337900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>950</td>\n",
+       "      <td>8.937600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1000</td>\n",
+       "      <td>8.938700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1050</td>\n",
+       "      <td>8.860100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1100</td>\n",
+       "      <td>8.693600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1150</td>\n",
+       "      <td>9.234000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1200</td>\n",
+       "      <td>9.347500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1250</td>\n",
+       "      <td>8.010300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1300</td>\n",
+       "      <td>5.952900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1350</td>\n",
+       "      <td>5.205900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1400</td>\n",
+       "      <td>4.969600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1450</td>\n",
+       "      <td>4.884600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1500</td>\n",
+       "      <td>4.934200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1550</td>\n",
+       "      <td>5.156900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1600</td>\n",
+       "      <td>5.115500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1650</td>\n",
+       "      <td>5.373600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1700</td>\n",
+       "      <td>4.481800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1750</td>\n",
+       "      <td>3.957000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1800</td>\n",
+       "      <td>3.092500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1850</td>\n",
+       "      <td>1.791000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1900</td>\n",
+       "      <td>1.934400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1950</td>\n",
+       "      <td>2.176800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>2.112400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2050</td>\n",
+       "      <td>2.127900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2100</td>\n",
+       "      <td>2.390200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2150</td>\n",
+       "      <td>3.091400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2200</td>\n",
+       "      <td>3.959500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2250</td>\n",
+       "      <td>3.905000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2300</td>\n",
+       "      <td>3.777500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2350</td>\n",
+       "      <td>3.282900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2400</td>\n",
+       "      <td>2.630300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2450</td>\n",
+       "      <td>3.705000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2500</td>\n",
+       "      <td>4.266300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2550</td>\n",
+       "      <td>4.285300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2600</td>\n",
+       "      <td>4.634000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2650</td>\n",
+       "      <td>4.474700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2700</td>\n",
+       "      <td>3.591300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2750</td>\n",
+       "      <td>2.486800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2800</td>\n",
+       "      <td>1.911800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2850</td>\n",
+       "      <td>2.088100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2900</td>\n",
+       "      <td>2.015400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2950</td>\n",
+       "      <td>1.988500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3000</td>\n",
+       "      <td>1.976900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3050</td>\n",
+       "      <td>2.097700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3100</td>\n",
+       "      <td>1.987400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3150</td>\n",
+       "      <td>2.065000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3200</td>\n",
+       "      <td>2.112100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3250</td>\n",
+       "      <td>2.075300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3300</td>\n",
+       "      <td>1.983300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3350</td>\n",
+       "      <td>2.181900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3400</td>\n",
+       "      <td>2.446500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3450</td>\n",
+       "      <td>2.434200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3500</td>\n",
+       "      <td>2.357000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3550</td>\n",
+       "      <td>2.157400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3600</td>\n",
+       "      <td>1.992900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3650</td>\n",
+       "      <td>2.018400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3700</td>\n",
+       "      <td>2.010200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3750</td>\n",
+       "      <td>2.009500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3800</td>\n",
+       "      <td>2.034900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3850</td>\n",
+       "      <td>2.038800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3900</td>\n",
+       "      <td>2.007600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3950</td>\n",
+       "      <td>1.983200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>2.005300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4050</td>\n",
+       "      <td>2.014900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4100</td>\n",
+       "      <td>2.018100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4150</td>\n",
+       "      <td>2.033900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4200</td>\n",
+       "      <td>2.024600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4250</td>\n",
+       "      <td>1.995300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4300</td>\n",
+       "      <td>2.018000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4350</td>\n",
+       "      <td>1.998300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4400</td>\n",
+       "      <td>2.032800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4450</td>\n",
+       "      <td>1.985900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4500</td>\n",
+       "      <td>1.967700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4550</td>\n",
+       "      <td>1.989400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4600</td>\n",
+       "      <td>2.004700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4650</td>\n",
+       "      <td>2.005800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4700</td>\n",
+       "      <td>2.014400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4750</td>\n",
+       "      <td>2.009200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4800</td>\n",
+       "      <td>2.002200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4850</td>\n",
+       "      <td>1.914300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4900</td>\n",
+       "      <td>2.016900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4950</td>\n",
+       "      <td>1.972900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5000</td>\n",
+       "      <td>2.010300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5050</td>\n",
+       "      <td>2.046600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5100</td>\n",
+       "      <td>1.993900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5150</td>\n",
+       "      <td>2.084500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5200</td>\n",
+       "      <td>2.011900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5250</td>\n",
+       "      <td>1.996500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5300</td>\n",
+       "      <td>1.997900</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new/commit/231f89403dca9aa67966e4f321e62bdb41076960', commit_message='End of training', commit_description='', oid='231f89403dca9aa67966e4f321e62bdb41076960', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new', endpoint='https://huggingface.co', repo_type='model', repo_id='KSU-HW-SEC/r1q1.5_graph_lora_new'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Load the pre-tokenized samples that the preprocessing cell saved to disk.\n",
+    "# A missing file raises a clear exception instead of calling exit() inside a notebook cell.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "# NOTE(review): weights_only=False unpickles arbitrary objects; only load files produced locally.\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Keep MODEL_NAME if an earlier cell defined it; otherwise use the default base checkpoint.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"  # default model\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    \"\"\"Map-style dataset over pre-tokenized samples that each carry a graph embedding.\"\"\"\n",
+    "\n",
+    "    # Keys copied out of every stored sample, in the order they are returned.\n",
+    "    _FIELDS = (\"input_ids\", \"attention_mask\", \"graph_embedding\", \"labels\")\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        # `data` must support len() and integer indexing; every item is a mapping holding _FIELDS.\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        \"\"\"Return the number of stored samples.\"\"\"\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        \"\"\"Return the model inputs of sample `idx`; any other keys of the sample are dropped.\"\"\"\n",
+    "        record = self.data[idx]\n",
+    "        return {field: record[field] for field in self._FIELDS}\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoConfig, Qwen2ForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(Qwen2ForCausalLM):\n",
+    "    \"\"\"Causal LM that reads one projected graph embedding as an extra leading token.\n",
+    "\n",
+    "    The previous version subclassed `AutoModelForCausalLM`. That class is a factory: its\n",
+    "    `from_config` returns an instance of the concrete architecture, never of the subclass,\n",
+    "    so `graph_proj` and this `forward` were not part of the model that got trained.\n",
+    "    Subclassing the concrete Qwen2 class (the architecture of the default MODEL_NAME)\n",
+    "    keeps the parameter names of the pretrained checkpoint, so its state dict still\n",
+    "    loads with `load_state_dict(..., strict=False)`; only `graph_proj.*` is new.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    GRAPH_DIM = 512  # width of the incoming graph embedding\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        # Linear map from the graph-embedding width to the model's hidden size.\n",
+    "        self.graph_proj = nn.Linear(self.GRAPH_DIM, config.hidden_size)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_config(cls, config, **kwargs):\n",
+    "        \"\"\"Build an instance from `config` without loading weights (entry point used by callers).\"\"\"\n",
+    "        return cls._from_config(config, **kwargs)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None, **kwargs):\n",
+    "        \"\"\"Run the language model with the graph embedding prepended as one token.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, shape (batch_size, seq_len).\n",
+    "            attention_mask: optional mask, shape (batch_size, seq_len).\n",
+    "            labels: optional targets, shape (batch_size, seq_len).\n",
+    "            graph_embedding: optional tensor, shape (batch_size, 512). When it is None the\n",
+    "                call is forwarded unchanged to the base model.\n",
+    "            **kwargs: passed through to the base model's forward.\n",
+    "\n",
+    "        Returns:\n",
+    "            The base model's output; its sequence dimension is seq_len + 1 when a graph\n",
+    "            embedding was supplied.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            return super().forward(\n",
+    "                input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs\n",
+    "            )\n",
+    "\n",
+    "        # Token embeddings: (batch_size, seq_len, hidden_size)\n",
+    "        inputs_embeds = self.get_input_embeddings()(input_ids)\n",
+    "\n",
+    "        # Project the graph embedding to hidden_size and shape it as one token:\n",
+    "        # (batch_size, 1, hidden_size). Dtypes are aligned so torch.cat never mixes precisions.\n",
+    "        graph_token = self.graph_proj(graph_embedding.to(self.graph_proj.weight.dtype))\n",
+    "        graph_token = graph_token.unsqueeze(1).to(inputs_embeds.dtype)\n",
+    "        inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)\n",
+    "\n",
+    "        # The graph token is always attended to.\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = attention_mask.new_ones((attention_mask.shape[0], 1))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        # Keep labels aligned with the longer sequence; -100 excludes the graph position from the loss.\n",
+    "        if labels is not None:\n",
+    "            graph_label = labels.new_full((labels.shape[0], 1), -100)\n",
+    "            labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        return super().forward(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "            **kwargs,\n",
+    "        )\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer whose loss step forwards the optional `graph_embedding` batch field to the model.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        \"\"\"Run one forward pass and return the loss reported by the model.\n",
+    "\n",
+    "        `inputs` must contain `input_ids`, `attention_mask` and `labels`; `graph_embedding`\n",
+    "        is passed to the model only when it is present and not None. Extra keyword\n",
+    "        arguments are accepted and ignored. Returns `(loss, outputs)` when\n",
+    "        `return_outputs` is true, otherwise just the loss.\n",
+    "        \"\"\"\n",
+    "        forward_args = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "        graph_embedding = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_embedding is not None:\n",
+    "            forward_args[\"graph_embedding\"] = graph_embedding\n",
+    "\n",
+    "        outputs = model(**forward_args)\n",
+    "        loss = outputs.loss\n",
+    "        if return_outputs:\n",
+    "            return (loss, outputs)\n",
+    "        return loss\n",
+    "\n",
+    "\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# 1. Load the configuration of the base checkpoint.\n",
+    "config = AutoConfig.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# 2. Build the model skeleton from that configuration.\n",
+    "model = GraphAwareLM.from_config(config) \n",
+    "\n",
+    "# 3. Copy the pretrained weights in. strict=False tolerates the extra graph projection,\n",
+    "#    so inspect the report instead of discarding it: only `graph_proj.*` may be missing.\n",
+    "pretrained_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "load_report = model.load_state_dict(pretrained_model.state_dict(), strict=False)\n",
+    "unloaded_keys = [key for key in load_report.missing_keys if not key.startswith(\"graph_proj.\")]\n",
+    "if unloaded_keys or load_report.unexpected_keys:\n",
+    "    raise RuntimeError(\n",
+    "        f\"Pretrained weights do not match the model: missing={unloaded_keys}, \"\n",
+    "        f\"unexpected={list(load_report.unexpected_keys)}\"\n",
+    "    )\n",
+    "\n",
+    "# Alternative kept for reference: load the modified `GraphAwareLM` directly.\n",
+    "# model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "# model.config.use_sliding_window_attention = False\n",
+    "\n",
+    "# Training arguments.\n",
+    "# NOTE(review): HF_NAME is not defined in this cell; presumably an earlier cell sets it — confirm.\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    bf16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # apply GaLore to every linear layer\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # low-rank projection parameters\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi0304\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Wrap `train_data` in a torch Dataset.\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# Train, then save locally and upload.\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.save_model(\"/workspace/model\")\n",
+    "trainer.push_to_hub()\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8d2ebf87-402e-444d-8599-96c313f1b7fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 处理后数据条数: 12384\n",
+      "✅ 示例数据: {'input_ids': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]), 'labels': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'graph_embedding': tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+      "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+      "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+      "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+      "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+      "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+      "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+      "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+      "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+      "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+      "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+      "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+      "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+      "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+      "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+      "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+      "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+      "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+      "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+      "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+      "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+      "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+      "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+      "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+      "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+      "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+      "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+      "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+      "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+      "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+      "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+      "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+      "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+      "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+      "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+      "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+      "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+      "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+      "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+      "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+      "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+      "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+      "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+      "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+      "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+      "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+      "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+      "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+      "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+      "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+      "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+      "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+      "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+      "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+      "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+      "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+      "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+      "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+      "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+      "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+      "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+      "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+      "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+      "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635])}\n",
+      "✅ train_data 已保存到 train_data.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token\n",
+    "\n",
+    "# NOTE(review): ROLE_TOKENS, max_seq_length and GRAPH_LENGTH are not defined in this cell;\n",
+    "# presumably an earlier cell provides them — confirm before running on a fresh kernel.\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:  # do not depend on the platform default encoding\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "train_data = []\n",
+    "\n",
+    "\n",
+    "for sample in data:\n",
+    "    conversations = sample.get(\"conversations\", [])\n",
+    "    embeddings = sample.get(\"embedding\", []) \n",
+    "\n",
+    "    # A sample without a usable embedding is reported and skipped.\n",
+    "    if not isinstance(embeddings, list) or len(embeddings) == 0:\n",
+    "        print(f\"无效的 embedding，跳过样本：{sample}\")\n",
+    "        continue\n",
+    "\n",
+    "    # squeeze(0) drops a leading singleton dimension when the embedding is stored nested.\n",
+    "    graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0)  # expected shape: [512]\n",
+    "\n",
+    "    # Flatten the conversation into one string: one \"<role token> text\" line per turn.\n",
+    "    turns = []\n",
+    "    for conv in conversations:\n",
+    "        role = conv[\"from\"]  # \"human\" or \"gpt\"\n",
+    "        content = conv[\"value\"].replace(\"<image>\", \"\")  # strip the <image> placeholder\n",
+    "        role_token = ROLE_TOKENS.get(role, f\"<|{role}|>\")  # generic tag for unknown roles\n",
+    "        turns.append(f\"{role_token} {content}\\n\")\n",
+    "    dialogue_text = \"\".join(turns)\n",
+    "\n",
+    "    tokenized = tokenizer(\n",
+    "        dialogue_text,\n",
+    "        padding=\"max_length\",\n",
+    "        truncation=True,\n",
+    "        max_length=max_seq_length - GRAPH_LENGTH,  # leave room for the graph embedding\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "\n",
+    "    input_ids = tokenized[\"input_ids\"].squeeze(0)\n",
+    "    attention_mask = tokenized[\"attention_mask\"].squeeze(0)\n",
+    "\n",
+    "    # The pad token is the EOS token, so padded positions must be excluded from the loss\n",
+    "    # explicitly; -100 is the label value the language-model loss ignores.\n",
+    "    labels = input_ids.clone()\n",
+    "    labels[attention_mask == 0] = -100\n",
+    "\n",
+    "    train_data.append({\n",
+    "        \"input_ids\": input_ids,\n",
+    "        \"attention_mask\": attention_mask,\n",
+    "        \"labels\": labels,\n",
+    "        \"graph_embedding\": graph_embedding,  # stored alongside the token tensors\n",
+    "    })\n",
+    "\n",
+    "# Fail clearly instead of hitting an IndexError below or saving an empty dataset.\n",
+    "if not train_data:\n",
+    "    raise ValueError(f\"No usable samples were found in {json_path}\")\n",
+    "\n",
+    "print(\"🚀 处理后数据条数:\", len(train_data))\n",
+    "print(\"✅ 示例数据:\", train_data[0])\n",
+    "torch.save(train_data, \"train_data.pt\")\n",
+    "print(\"✅ train_data 已保存到 train_data.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a33bffb9-2ff9-4a4d-af2c-b89b30a69f7d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:49: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_074031-ofm5zhvd</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/ofm5zhvd' target=\"_blank\">experi0304</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/ofm5zhvd' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/ofm5zhvd</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='89' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [  89/5310 01:06 < 1:06:24, 1.31 it/s, Epoch 0.05/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 150\u001b[0m\n\u001b[1;32m    143\u001b[0m \u001b[38;5;66;03m# ✅ 训练\u001b[39;00m\n\u001b[1;32m    144\u001b[0m trainer \u001b[38;5;241m=\u001b[39m GraphTrainer(\n\u001b[1;32m    145\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m    146\u001b[0m     args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[1;32m    147\u001b[0m     train_dataset\u001b[38;5;241m=\u001b[39mtrain_dataset,\n\u001b[1;32m    148\u001b[0m )\n\u001b[0;32m--> 150\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    151\u001b[0m trainer\u001b[38;5;241m.\u001b[39mpush_to_hub()\n\u001b[1;32m    152\u001b[0m trainer\u001b[38;5;241m.\u001b[39msave_model(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/workspace/model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2232\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   2229\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   2230\u001b[0m     \u001b[38;5;66;03m# Disable progress bars when uploading models during checkpoints to avoid polluting stdout\u001b[39;00m\n\u001b[1;32m   2231\u001b[0m     hf_hub_utils\u001b[38;5;241m.\u001b[39mdisable_progress_bars()\n\u001b[0;32m-> 2232\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2233\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2234\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2235\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2236\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2237\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2238\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m   2239\u001b[0m     hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2548\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2541\u001b[0m context \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   2542\u001b[0m     functools\u001b[38;5;241m.\u001b[39mpartial(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mno_sync, model\u001b[38;5;241m=\u001b[39mmodel)\n\u001b[1;32m   2543\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(batch_samples) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m   2544\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m!=\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mDEEPSPEED\n\u001b[1;32m   2545\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m contextlib\u001b[38;5;241m.\u001b[39mnullcontext\n\u001b[1;32m   2546\u001b[0m )\n\u001b[1;32m   2547\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[0;32m-> 2548\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2550\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   2551\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   2552\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m   2553\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m 
(torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   2554\u001b[0m ):\n\u001b[1;32m   2555\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   2556\u001b[0m     tr_loss \u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m+\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3740\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m   3737\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m==\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mDEEPSPEED:\n\u001b[1;32m   3738\u001b[0m     kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscale_wrt_gas\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m-> 3740\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccelerator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mloss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\u001b[38;5;241m.\u001b[39mdetach()\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py:2325\u001b[0m, in \u001b[0;36mAccelerator.backward\u001b[0;34m(self, loss, **kwargs)\u001b[0m\n\u001b[1;32m   2323\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m   2324\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscaler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 2325\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscaler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale\u001b[49m\u001b[43m(\u001b[49m\u001b[43mloss\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2326\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m learning_rate \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_lomo_optimizer:\n\u001b[1;32m   2327\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlomo_backward(loss, learning_rate)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/_tensor.py:492\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m    482\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m    483\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m    484\u001b[0m         Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m    485\u001b[0m         (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    490\u001b[0m         inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m    491\u001b[0m     )\n\u001b[0;32m--> 492\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    493\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m    494\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py:251\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m    246\u001b[0m     retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m    248\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m    249\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m    250\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 251\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m    252\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    253\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    254\u001b[0m \u001b[43m    \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    255\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    256\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    257\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    258\u001b[0m \u001b[43m    \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    259\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "\n",
+    "import torch\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments\n",
+    "\n",
+    "# Always reload the prepared samples from disk so this cell behaves the same on a\n",
+    "# fresh kernel. The former `'train_data' not in globals()` guard could never fire,\n",
+    "# because train_data had already been assigned unconditionally just above it.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):\n",
+    "    # Raise instead of calling exit(): in a notebook exit() shuts the kernel down\n",
+    "    # rather than surfacing an error in the cell.\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "\n",
+    "# NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted files.\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Fall back to the default base model when no earlier cell defined MODEL_NAME.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# The plain base model is intentionally not loaded here: that copy was overwritten by\n",
+    "# GraphAwareLM.from_pretrained(MODEL_NAME) later in this cell before it was ever used.\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    \"\"\"Map-style dataset that exposes the four per-sample fields used for training.\"\"\"\n",
+    "\n",
+    "    # Keys copied from every stored sample, in the order they are returned.\n",
+    "    _FIELDS = (\"input_ids\", \"attention_mask\", \"graph_embedding\", \"labels\")\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        # `data` is an indexable, sized collection of dict-like samples.\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        record = self.data[idx]\n",
+    "        # graph_embedding is the extra, non-text input carried next to the token fields.\n",
+    "        return {field: record[field] for field in self._FIELDS}\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "from transformers import Qwen2ForCausalLM\n",
+    "\n",
+    "\n",
+    "class GraphAwareLM(Qwen2ForCausalLM):\n",
+    "    \"\"\"Qwen2 causal LM that prepends one projected graph-embedding token to the input.\n",
+    "\n",
+    "    The earlier version subclassed AutoModelForCausalLM, which is only a factory:\n",
+    "    its from_pretrained() hands back a plain Qwen2ForCausalLM, so the custom\n",
+    "    __init__/forward never ran and graph_embedding was not used. Subclassing the\n",
+    "    concrete model class lets the inherited from_pretrained() build this class, and\n",
+    "    graph_proj is saved and reloaded together with the base weights.\n",
+    "\n",
+    "    NOTE(review): assumes MODEL_NAME points at a Qwen2 checkpoint -- confirm if the\n",
+    "    base model is ever changed.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Width of the incoming graph embedding vector.\n",
+    "    GRAPH_EMBEDDING_DIM = 512\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        # Maps the graph embedding into the token-embedding space (hidden_size).\n",
+    "        self.graph_proj = nn.Linear(self.GRAPH_EMBEDDING_DIM, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None, **kwargs):\n",
+    "        \"\"\"Run the LM with an optional graph token placed in front of the text tokens.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch_size, seq_len) token ids.\n",
+    "            attention_mask: optional (batch_size, seq_len) mask.\n",
+    "            labels: optional (batch_size, seq_len) target ids.\n",
+    "            graph_embedding: optional tensor holding 512 values per sample, e.g.\n",
+    "                (batch_size, 512) or (batch_size, 1, 512).\n",
+    "            **kwargs: forwarded unchanged to Qwen2ForCausalLM.forward.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output of Qwen2ForCausalLM.forward. With a graph embedding the\n",
+    "            sequence axis has length seq_len + 1.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            # No graph input: behave exactly like the base model.\n",
+    "            return super().forward(\n",
+    "                input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs\n",
+    "            )\n",
+    "\n",
+    "        inputs_embeds = self.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden)\n",
+    "        batch_size = inputs_embeds.shape[0]\n",
+    "\n",
+    "        # Flatten any singleton axes so the projection always sees (batch, 512).\n",
+    "        graph_embedding = graph_embedding.reshape(batch_size, -1).to(self.graph_proj.weight.dtype)\n",
+    "        graph_token = self.graph_proj(graph_embedding).unsqueeze(1)  # (batch, 1, hidden)\n",
+    "        inputs_embeds = torch.cat([graph_token.to(inputs_embeds.dtype), inputs_embeds], dim=1)\n",
+    "\n",
+    "        if attention_mask is not None:\n",
+    "            # The graph token is always attended to.\n",
+    "            graph_mask = attention_mask.new_ones((batch_size, 1))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        if labels is not None:\n",
+    "            # Keep labels aligned with the longer sequence; -100 is ignored by the loss.\n",
+    "            graph_label = labels.new_full((batch_size, 1), -100)\n",
+    "            labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        return super().forward(\n",
+    "            inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs\n",
+    "        )\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer whose loss step forwards the optional graph embedding to the model.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        # The three text fields are mandatory; a missing key raises KeyError.\n",
+    "        model_inputs = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "        # Pass graph_embedding along only when the batch actually carries one.\n",
+    "        graph_embedding = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_embedding is not None:\n",
+    "            model_inputs[\"graph_embedding\"] = graph_embedding\n",
+    "\n",
+    "        outputs = model(**model_inputs)\n",
+    "\n",
+    "        if return_outputs:\n",
+    "            return (outputs.loss, outputs)\n",
+    "        return outputs.loss\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Instantiate the graph-aware model from the base checkpoint.\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "# model.config.use_sliding_window_attention = False\n",
+    "\n",
+    "# Training hyper-parameters, collected in one mapping before building TrainingArguments.\n",
+    "training_config = dict(\n",
+    "    output_dir=\"./results\",\n",
+    "    run_name=\"experi0304\",\n",
+    "    # schedule\n",
+    "    num_train_epochs=3,\n",
+    "    warmup_steps=1000,\n",
+    "    per_device_train_batch_size=7,\n",
+    "    fp16=True,\n",
+    "    # GaLore optimizer applied to every linear layer, with its low-rank settings\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",\n",
+    "    optim_args=\"rank=128,scale=2.0\",\n",
+    "    # logging / checkpointing\n",
+    "    eval_strategy=\"no\",\n",
+    "    logging_steps=50,\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    # hub upload on every checkpoint save\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    ")\n",
+    "training_args = TrainingArguments(**training_config)\n",
+    "\n",
+    "# Wrap the loaded samples so the trainer can index them.\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "trainer = GraphTrainer(model=model, args=training_args, train_dataset=train_dataset)\n",
+    "\n",
+    "# Train, upload the result to the Hub, then keep a local copy.\n",
+    "trainer.train()\n",
+    "trainer.push_to_hub()\n",
+    "trainer.save_model(\"/workspace/model\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "05a48aa8-c597-4ff1-9569-aa210f4f1f5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "from transformers import Qwen2ForCausalLM\n",
+    "\n",
+    "\n",
+    "class GraphAwareLM(Qwen2ForCausalLM):\n",
+    "    \"\"\"Qwen2 causal LM that prepends one projected graph-embedding token to the input.\n",
+    "\n",
+    "    Fixes over the previous definition in this cell:\n",
+    "    * It subclassed AutoModelForCausalLM, a factory whose from_pretrained() returns a\n",
+    "      plain Qwen2ForCausalLM, so the custom forward() below was never the one called.\n",
+    "    * The from_pretrained() override replaced graph_proj with a freshly initialised\n",
+    "      layer on every load. It is removed; the inherited from_pretrained() keeps the\n",
+    "      same call signature. NOTE(review): presumably it restores graph_proj from the\n",
+    "      checkpoint when stored there and initialises it otherwise -- confirm.\n",
+    "    * squeeze(0) removed the batch axis for a batch of one; the embedding is now\n",
+    "      reshaped to (batch_size, 512) explicitly.\n",
+    "    * forward() called self.model (the bare decoder) with labels; it now delegates to\n",
+    "      the full causal-LM forward so logits and loss are produced.\n",
+    "    * labels are extended with an ignored position for the graph token.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Width of the incoming graph embedding vector.\n",
+    "    GRAPH_EMBEDDING_DIM = 512\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        # Maps the graph embedding into the token-embedding space (hidden_size).\n",
+    "        self.graph_proj = nn.Linear(self.GRAPH_EMBEDDING_DIM, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None, **kwargs):\n",
+    "        \"\"\"Run the LM with an optional graph token placed in front of the text tokens.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch_size, seq_len) token ids.\n",
+    "            attention_mask: optional (batch_size, seq_len) mask.\n",
+    "            labels: optional (batch_size, seq_len) target ids.\n",
+    "            graph_embedding: optional tensor holding 512 values per sample, e.g.\n",
+    "                (batch_size, 512) or (batch_size, 1, 512).\n",
+    "            **kwargs: forwarded unchanged to Qwen2ForCausalLM.forward.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output of Qwen2ForCausalLM.forward. With a graph embedding the\n",
+    "            sequence axis has length seq_len + 1.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            # No graph input: behave exactly like the base model.\n",
+    "            return super().forward(\n",
+    "                input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs\n",
+    "            )\n",
+    "\n",
+    "        inputs_embeds = self.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden)\n",
+    "        batch_size = inputs_embeds.shape[0]\n",
+    "\n",
+    "        # Flatten any singleton axes so the projection always sees (batch, 512).\n",
+    "        graph_embedding = graph_embedding.reshape(batch_size, -1).to(self.graph_proj.weight.dtype)\n",
+    "        graph_token = self.graph_proj(graph_embedding).unsqueeze(1)  # (batch, 1, hidden)\n",
+    "        inputs_embeds = torch.cat([graph_token.to(inputs_embeds.dtype), inputs_embeds], dim=1)\n",
+    "\n",
+    "        if attention_mask is not None:\n",
+    "            # The graph token is always attended to.\n",
+    "            graph_mask = attention_mask.new_ones((batch_size, 1))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        if labels is not None:\n",
+    "            # Keep labels aligned with the longer sequence; -100 is ignored by the loss.\n",
+    "            graph_label = labels.new_full((batch_size, 1), -100)\n",
+    "            labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        return super().forward(\n",
+    "            inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs\n",
+    "        )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "73ae15d9-c9d9-4e64-ac8b-2d5877eac984",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "21c8df04-0dc2-436c-aaaf-74a885f734d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Qwen2ForCausalLM(\n",
+       "  (model): Qwen2Model(\n",
+       "    (embed_tokens): Embedding(151936, 1536)\n",
+       "    (layers): ModuleList(\n",
+       "      (0-27): 28 x Qwen2DecoderLayer(\n",
+       "        (self_attn): Qwen2Attention(\n",
+       "          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
+       "        )\n",
+       "        (mlp): Qwen2MLP(\n",
+       "          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
+       "          (act_fn): SiLU()\n",
+       "        )\n",
+       "        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "      )\n",
+       "    )\n",
+       "    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "    (rotary_emb): Qwen2RotaryEmbedding()\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
+       "  (graph_proj): Linear(in_features=512, out_features=1536, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# 加载 tokenizer\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# 加载训练好的模型\n",
+    "model_path = \"/workspace/model\"\n",
+    "model = GraphAwareLM.from_pretrained(model_path).to(device)\n",
+    "model.eval()  # 设置为推理模式\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7a8562c0-8d55-4412-8f89-de20bae0f7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Chat-format role markers and sequence-length budget.\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",\n",
+    "    \"gpt\": \"<|Assistant|>\",\n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH\n",
+    "\n",
+    "# Read the dataset and take its first record as the test sample.\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\") as f:\n",
+    "    data = json.load(f)\n",
+    "test_data = data[0]\n",
+    "\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\")\n",
+    "\n",
+    "# Graph embedding as a float32 tensor on the target device.\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32).to(device)\n",
+    "\n",
+    "# Question text: fifth conversation turn with the <image> placeholder stripped.\n",
+    "question1 = conversations[4][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "\n",
+    "# Tokenize the question, truncating to the text share of the length budget.\n",
+    "inputs = tokenizer(\n",
+    "    question1,\n",
+    "    return_tensors=\"pt\",\n",
+    "    truncation=True,\n",
+    "    max_length=max_seq_length - GRAPH_LENGTH,\n",
+    ").to(device)\n",
+    "\n",
+    "input_ids = inputs[\"input_ids\"]\n",
+    "attention_mask = inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62f40327-f102-4259-80a5-8761d5d7d3c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "         -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "         -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "         -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "         -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "          0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "         -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "         -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "         -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "         -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "          0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "          0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "         -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "          1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "         -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "         -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "         -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "         -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "         -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "          0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "          0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "         -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "          1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "          0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "         -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "         -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "         -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "         -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "         -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "          2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "         -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "         -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "         -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "          0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "         -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "         -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "          1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "         -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "         -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "         -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "         -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "         -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "          1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "          0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "         -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "          1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "         -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "          0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "         -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "         -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "          2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "         -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "          0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "         -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "         -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "          0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "          0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "         -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "          0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "         -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "         -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "         -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "         -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "          1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635]],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph_embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "067a0cf7-3010-4b6b-b2aa-d4ce95010d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "模型回复：  How\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One gradient-free forward pass over the prompt together with the graph embedding.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(\n",
+    "        input_ids=input_ids,\n",
+    "        attention_mask=attention_mask,\n",
+    "        graph_embedding=graph_embedding,\n",
+    "    )\n",
+    "\n",
+    "# Greedy choice for the next position: arg-max over the logits of the last step.\n",
+    "logits = outputs.logits[:, -1, :]\n",
+    "predicted_id = logits.argmax(dim=-1)\n",
+    "\n",
+    "# Convert the selected token id back into text.\n",
+    "response_text = tokenizer.decode(predicted_id, skip_special_tokens=True)\n",
+    "\n",
+    "print(\"模型回复：\", response_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ae38ed68-bc6a-4bc3-aee8-d54d2dd689ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: Is there any sequential logic in the module, and if so, how is it handled? What are the module's inputs and outputs?\n",
+      "What are the module's inputs and outputs?\n",
+      "What are the module's inputs and outputs?\n",
+      "What are the module's inputs and outputs?\n",
+      "What is the module's input, and what is the module's output, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's output, and what is the module's output, and what is the module's output, and what is the module's output, and module's output, and module's input, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output. 
Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and 
module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module\n"
+     ]
+    }
+   ],
+   "source": [
+    "max_new_tokens = 1024\n",
+    "generated_ids = input_ids.clone()\n",
+    "generated_attention_mask = attention_mask.clone()\n",
+    "\n",
+    "# Greedy decoding without gradients: every step re-runs the model on the whole\n",
+    "# sequence produced so far and appends the arg-max token.\n",
+    "with torch.no_grad():\n",
+    "    for _ in range(max_new_tokens):\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,                  # (batch_size, seq_len)\n",
+    "            attention_mask=generated_attention_mask,  # (batch_size, seq_len)\n",
+    "            graph_embedding=graph_embedding,          # (batch_size, 512)\n",
+    "        )\n",
+    "\n",
+    "        # Logits of the final position decide the next token.\n",
+    "        logits = outputs.logits[:, -1, :]\n",
+    "        next_token = logits.argmax(dim=-1)\n",
+    "\n",
+    "        # Append the new token to the running sequence.\n",
+    "        generated_ids = torch.cat([generated_ids, next_token.unsqueeze(1)], dim=1)\n",
+    "\n",
+    "        # Stop at end-of-sequence; .item() means a single sequence is decoded at a time.\n",
+    "        if next_token.item() == tokenizer.eos_token_id:\n",
+    "            break\n",
+    "\n",
+    "        # Extend the mask by one attended position for the token just added.\n",
+    "        mask_step = generated_attention_mask.new_ones((1, 1))\n",
+    "        generated_attention_mask = torch.cat([generated_attention_mask, mask_step], dim=1)\n",
+    "\n",
+    "# Decode the full sequence (prompt plus generated tokens) back to text.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "803f41fe-f504-4c2a-96b4-afc2cd437d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[151646,   3838,    525,    279,   8286,  17473,    304,    279,   6250,\n",
+       "          50773,   2038,    369,    279,  29952,   4688,     11,    323,   1128,\n",
+       "            525,    862,   9895,     30]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87d1396b-4d20-4a76-a092-b26a587a76ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

.ipynb_checkpoints/graph_train2-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,1674 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fa17529d-eaa7-473e-9d2d-cc05a0120a51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\" \n",
+    "GRAPH_LENGTH = 512\n",
+    "HF_NAME = \"KSU-HW-SEC/r1q1.5_graph_lora_new2\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bba6e6db-4b79-4461-ba13-75fd41019358",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CUDA 可用: True\n",
+      "GPU 数量: 1\n",
+      "当前 GPU: 0\n",
+      "GPU 名称: NVIDIA A100 80GB PCIe\n"
+     ]
+    }
+   ],
+   "source": [
+    "# !pip install transformers accelerate datasets\n",
+    "# !pip install galore-torch\n",
+    "# !pip install huggingface_hub\n",
+    "import torch\n",
+    "\n",
+    "cuda_available = torch.cuda.is_available()\n",
+    "print(\"CUDA 可用:\", cuda_available)\n",
+    "print(\"GPU 数量:\", torch.cuda.device_count())\n",
+    "# current_device() and get_device_name() raise on a CPU-only host, so query them\n",
+    "# only when CUDA is actually available.\n",
+    "if cuda_available:\n",
+    "    current_gpu = torch.cuda.current_device()\n",
+    "    print(\"当前 GPU:\", current_gpu)\n",
+    "    print(\"GPU 名称:\", torch.cuda.get_device_name(current_gpu))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ef5551ca-89e2-4488-8e68-1c8d964de039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_seq_length = 1100 + GRAPH_LENGTH  # 最大序列长度"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8e283f49-fde4-46e2-9891-dbc304058f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:48: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_111730-i9v1vlu1</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/i9v1vlu1' target=\"_blank\">experi030402</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/i9v1vlu1' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/i9v1vlu1</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5310' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5310/5310 1:34:08, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>5.319300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>3.641300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>1.521800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>1.027500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>0.922400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>0.866900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>0.800500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>0.721600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>0.740400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>0.737000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>550</td>\n",
+       "      <td>0.713500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>600</td>\n",
+       "      <td>0.747000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>650</td>\n",
+       "      <td>0.869500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>700</td>\n",
+       "      <td>1.473300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>750</td>\n",
+       "      <td>0.753000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>800</td>\n",
+       "      <td>0.741300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>850</td>\n",
+       "      <td>0.751400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>900</td>\n",
+       "      <td>0.787600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>950</td>\n",
+       "      <td>0.783200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1000</td>\n",
+       "      <td>0.780200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1050</td>\n",
+       "      <td>1.012900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1100</td>\n",
+       "      <td>1.411700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1150</td>\n",
+       "      <td>1.536400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1200</td>\n",
+       "      <td>0.853800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1250</td>\n",
+       "      <td>0.756500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1300</td>\n",
+       "      <td>0.750800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1350</td>\n",
+       "      <td>0.747400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1400</td>\n",
+       "      <td>0.844400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1450</td>\n",
+       "      <td>0.858400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1500</td>\n",
+       "      <td>1.053400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1550</td>\n",
+       "      <td>1.591600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1600</td>\n",
+       "      <td>1.498900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1650</td>\n",
+       "      <td>1.471700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1700</td>\n",
+       "      <td>1.221100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1750</td>\n",
+       "      <td>1.802300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1800</td>\n",
+       "      <td>1.826000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1850</td>\n",
+       "      <td>1.857300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1900</td>\n",
+       "      <td>1.561800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1950</td>\n",
+       "      <td>1.398800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>1.398900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2050</td>\n",
+       "      <td>1.381600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2100</td>\n",
+       "      <td>0.890300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2150</td>\n",
+       "      <td>0.763700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2200</td>\n",
+       "      <td>0.753100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2250</td>\n",
+       "      <td>0.745500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2300</td>\n",
+       "      <td>1.186100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2350</td>\n",
+       "      <td>0.862000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2400</td>\n",
+       "      <td>1.024600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2450</td>\n",
+       "      <td>1.028400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2500</td>\n",
+       "      <td>1.008500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2550</td>\n",
+       "      <td>0.942800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2600</td>\n",
+       "      <td>0.849700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2650</td>\n",
+       "      <td>0.771400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2700</td>\n",
+       "      <td>0.794100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2750</td>\n",
+       "      <td>0.819200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2800</td>\n",
+       "      <td>0.937500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2850</td>\n",
+       "      <td>1.064500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2900</td>\n",
+       "      <td>1.189300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2950</td>\n",
+       "      <td>1.071100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3000</td>\n",
+       "      <td>1.003300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3050</td>\n",
+       "      <td>1.073900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3100</td>\n",
+       "      <td>1.043100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3150</td>\n",
+       "      <td>1.282600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3200</td>\n",
+       "      <td>2.145400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3250</td>\n",
+       "      <td>1.925800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3300</td>\n",
+       "      <td>2.005600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3350</td>\n",
+       "      <td>2.122600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3400</td>\n",
+       "      <td>2.163000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3450</td>\n",
+       "      <td>2.046600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3500</td>\n",
+       "      <td>2.152200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3550</td>\n",
+       "      <td>2.151700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3600</td>\n",
+       "      <td>5.394900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3650</td>\n",
+       "      <td>4.677800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3700</td>\n",
+       "      <td>4.122200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3750</td>\n",
+       "      <td>3.710200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3800</td>\n",
+       "      <td>3.350800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3850</td>\n",
+       "      <td>3.126300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3900</td>\n",
+       "      <td>2.988700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3950</td>\n",
+       "      <td>2.872000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>2.848200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4050</td>\n",
+       "      <td>2.823900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4100</td>\n",
+       "      <td>2.781200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4150</td>\n",
+       "      <td>2.735000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4200</td>\n",
+       "      <td>2.725900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4250</td>\n",
+       "      <td>2.644400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4300</td>\n",
+       "      <td>2.700000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4350</td>\n",
+       "      <td>2.650100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4400</td>\n",
+       "      <td>2.704500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4450</td>\n",
+       "      <td>2.596700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4500</td>\n",
+       "      <td>2.510500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4550</td>\n",
+       "      <td>2.515800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4600</td>\n",
+       "      <td>2.498100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4650</td>\n",
+       "      <td>2.458900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4700</td>\n",
+       "      <td>2.449700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4750</td>\n",
+       "      <td>2.425000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4800</td>\n",
+       "      <td>2.362300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4850</td>\n",
+       "      <td>2.232000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4900</td>\n",
+       "      <td>2.361500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4950</td>\n",
+       "      <td>2.302300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5000</td>\n",
+       "      <td>2.333900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5050</td>\n",
+       "      <td>2.367200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5100</td>\n",
+       "      <td>2.288300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5150</td>\n",
+       "      <td>2.426100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5200</td>\n",
+       "      <td>2.344100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5250</td>\n",
+       "      <td>2.283500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5300</td>\n",
+       "      <td>2.296500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new2/commit/291285a5f2155c79a0da893645d8df9bbca98f63', commit_message='End of training', commit_description='', oid='291285a5f2155c79a0da893645d8df9bbca98f63', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new2', endpoint='https://huggingface.co', repo_type='model', repo_id='KSU-HW-SEC/r1q1.5_graph_lora_new2'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Always (re)load the pre-tokenised training samples from disk so this cell\n",
+    "# does not depend on leftover kernel state.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):\n",
+    "    # Fail loudly with a real exception instead of calling exit().\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "# NOTE: weights_only=False unpickles arbitrary Python objects; only load\n",
+    "# files that come from a trusted source.\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Reuse MODEL_NAME when an earlier cell defined it, otherwise fall back to\n",
+    "# the base checkpoint.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"  # default model\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    def __init__(self, data):\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        sample = self.data[idx]\n",
+    "        return {\n",
+    "            \"input_ids\": sample[\"input_ids\"],\n",
+    "            \"attention_mask\": sample[\"attention_mask\"],\n",
+    "            \"graph_embedding\": sample[\"graph_embedding\"],  # 额外输入\n",
+    "            \"labels\": sample[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM that reads one projected graph token ahead of the text tokens.\n",
+    "\n",
+    "    The pretrained model is wrapped rather than subclassing\n",
+    "    `AutoModelForCausalLM`: the Auto class is only a factory. Its `__init__`\n",
+    "    refuses direct instantiation and its `from_pretrained` returns the concrete\n",
+    "    architecture class, so `graph_proj` and this `forward` would never be used.\n",
+    "\n",
+    "    NOTE(review): as a plain `nn.Module`, Trainer presumably stores checkpoints\n",
+    "    as a state dict rather than via `save_pretrained` - confirm before relying\n",
+    "    on `AutoModelForCausalLM.from_pretrained` to reload them.\n",
+    "\n",
+    "    Args:\n",
+    "        pretrained_model_name_or_path: Checkpoint passed to\n",
+    "            `AutoModelForCausalLM.from_pretrained`.\n",
+    "        graph_dim: Size of the incoming graph embedding (default 512).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, graph_dim=512):\n",
+    "        super().__init__()\n",
+    "        # Load the pretrained backbone exactly once.\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)\n",
+    "        # Keep `config` reachable on the wrapper, as it was on the original class.\n",
+    "        self.config = self.model.config\n",
+    "        # Projects the graph embedding to the LM hidden size.\n",
+    "        self.graph_proj = nn.Linear(graph_dim, self.config.hidden_size)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, graph_dim=512):\n",
+    "        \"\"\"Build the wrapper; keeps `GraphAwareLM.from_pretrained(name)` working.\"\"\"\n",
+    "        return cls(pretrained_model_name_or_path, graph_dim=graph_dim)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the LM with the graph token prepended to the token embeddings.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: Token ids, (batch_size, seq_len).\n",
+    "            attention_mask: Optional mask, (batch_size, seq_len).\n",
+    "            labels: Optional target ids, (batch_size, seq_len).\n",
+    "            graph_embedding: Optional graph vectors, (batch_size, graph_dim).\n",
+    "                When None the model behaves like the plain language model.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output object of the wrapped causal LM.\n",
+    "        \"\"\"\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden)\n",
+    "\n",
+    "        if graph_embedding is not None:\n",
+    "            batch_size = inputs_embeds.shape[0]\n",
+    "            graph_token = self.graph_proj(graph_embedding.to(self.graph_proj.weight.dtype))\n",
+    "            # (batch, hidden) -> (batch, 1, hidden), in the dtype of the token embeddings.\n",
+    "            graph_token = graph_token.unsqueeze(1).to(inputs_embeds.dtype)\n",
+    "            inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)  # (batch, seq_len+1, hidden)\n",
+    "\n",
+    "            if attention_mask is not None:\n",
+    "                graph_mask = torch.ones((batch_size, 1), device=attention_mask.device, dtype=attention_mask.dtype)\n",
+    "                attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "            if labels is not None:\n",
+    "                # The sequence grew by one position, so the labels must grow too;\n",
+    "                # -100 is the index the LM loss ignores, so the graph slot is not a target.\n",
+    "                graph_label = torch.full((batch_size, 1), -100, device=labels.device, dtype=labels.dtype)\n",
+    "                labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        return self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer whose loss step forwards the optional `graph_embedding` input.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        \"\"\"Call the model on one batch and return its loss.\n",
+    "\n",
+    "        `graph_embedding` is passed to the model only when the batch carries a\n",
+    "        non-None value for it. Returns `(loss, outputs)` when `return_outputs`\n",
+    "        is true, otherwise just the loss.\n",
+    "        \"\"\"\n",
+    "        forward_args = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "        graph_vec = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_vec is not None:\n",
+    "            forward_args[\"graph_embedding\"] = graph_vec\n",
+    "\n",
+    "        outputs = model(**forward_args)\n",
+    "\n",
+    "        if return_outputs:\n",
+    "            return (outputs.loss, outputs)\n",
+    "        return outputs.loss\n",
+    "\n",
+    "\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# ✅ 载入微调模型\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# # 1. 加载模型的配置\n",
+    "# config = AutoConfig.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# # 2. 使用配置创建 GraphAwareLM 实例\n",
+    "# model = GraphAwareLM.from_config(config) \n",
+    "\n",
+    "# pretrained_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "# model.load_state_dict(pretrained_model.state_dict(), strict=False)\n",
+    "\n",
+    "# ✅ 载入修改后的 `GraphAwareLM` 模型\n",
+    "# model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "# model.config.use_sliding_window_attention = False\n",
+    "\n",
+    "# ✅ 训练参数\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results2\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    bf16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # ✅ 让 GaLore 作用于所有线性层\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # ✅ 低秩分解参数\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi030402\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# ✅ 转换 `train_data` 为 `Dataset`\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# ✅ 训练\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.save_model(\"/workspace/model2\")\n",
+    "trainer.push_to_hub()\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7a72ac3b-561e-41d3-ae93-99f20acf3188",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora-wandb', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora-wandb')"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "\n",
+    "api = HfApi()\n",
+    "repo_name = \"r1q1.5_graph_lora-wandb\"  # 你的模型名称\n",
+    "api.create_repo(repo_name, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "73c434b9-5d58-4819-8526-24aa18ca1010",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "727ca342a20348d38a4a1c6d286963e0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "optimizer.pt:   0%|          | 0.00/4.32G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d4967e00603441091d4527bccf89e43",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "71abe1a9a91a4851bdd941995145dc8e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 15 LFS files:   0%|          | 0/15 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "59323569e2fb415bbdbb006b3ad3bceb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8855625556094031ba6b14bbd9a062b1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c1f85411d4ac43ffa27ed9d24c18f468",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e52bc8f9a5f34887a3844215e4fdfde2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "10d59eeefafe474488b7972ccf3ca70a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dfc1c91010114f09b760450f1b998aa4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9ebc84de361240ad9317c70342168308",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "optimizer.pt:   0%|          | 0.00/4.32G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4f6e8853b6ff41e88b967ae9c13a81ab",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "75eb5a5e71c749b5945b66e2d41c441c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d748aea8d80c4f81a36633a71ce0a0f7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d1a84a5ec38f459bab4791065dc8efc3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ae50659ac6024e5c8fc43ce898982f0c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c109c00c979d41d59fa48d4314bac1e4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/YiFzhao/r1q1.5_graph_lora-results3/commit/5a14bfa05e9bd78cd030104d0e9ff02638731668', commit_message='upload results3', commit_description='', oid='5a14bfa05e9bd78cd030104d0e9ff02638731668', pr_url=None, repo_url=RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora-results3', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora-results3'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import upload_folder\n",
+    "\n",
+    "upload_folder(\n",
+    "    folder_path = \"/workspace/wandb\",\n",
+    "    repo_id = \"YiFzhao/r1q1.5_graph_lora-wandb\",\n",
+    "    commit_message = \"upload wandb\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8d2ebf87-402e-444d-8599-96c313f1b7fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 处理后数据条数: 12384\n",
+      "✅ 示例数据: {'input_ids': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]), 'labels': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'graph_embedding': tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+      "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+      "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+      "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+      "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+      "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+      "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+      "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+      "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+      "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+      "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+      "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+      "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+      "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+      "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+      "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+      "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+      "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+      "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+      "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+      "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+      "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+      "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+      "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+      "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+      "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+      "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+      "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+      "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+      "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+      "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+      "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+      "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+      "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+      "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+      "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+      "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+      "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+      "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+      "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+      "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+      "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+      "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+      "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+      "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+      "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+      "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+      "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+      "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+      "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+      "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+      "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+      "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+      "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+      "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+      "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+      "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+      "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+      "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+      "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+      "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+      "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+      "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+      "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635])}\n",
+      "✅ train_data 已保存到 train_data.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Build train_data.pt: one record per sample with token ids, attention mask, labels and graph embedding.\n",
+    "# Relies on MODEL_NAME, ROLE_TOKENS, GRAPH_LENGTH and max_seq_length defined in other cells.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # pad with the EOS token\n",
+    "\n",
+    "json_path = \"final_Graph.json\"\n",
+    "# Explicit encoding so the file is read the same way on every platform.\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "train_data = []\n",
+    "\n",
+    "for sample in data:\n",
+    "    conversations = sample.get(\"conversations\", [])\n",
+    "    embeddings = sample.get(\"embedding\", [])\n",
+    "\n",
+    "    # Skip samples that carry no usable embedding list.\n",
+    "    if not isinstance(embeddings, list) or len(embeddings) == 0:\n",
+    "        print(f\"无效的 embedding，跳过样本：{sample}\")\n",
+    "        continue\n",
+    "\n",
+    "    # squeeze(0) removes a leading singleton axis when the vector is stored nested.\n",
+    "    graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0)\n",
+    "\n",
+    "    # Concatenate all turns as \"<role token> content\" lines.\n",
+    "    dialogue_text = \"\"\n",
+    "    for conv in conversations:\n",
+    "        role = conv[\"from\"]  # \"human\" or \"gpt\"\n",
+    "        content = conv[\"value\"]\n",
+    "        content = content.replace(\"<image>\", \"\")  # drop the <image> placeholder\n",
+    "        role_token = ROLE_TOKENS.get(role, f\"<|{role}|>\")  # fallback for unknown roles\n",
+    "        dialogue_text += f\"{role_token} {content}\n\"\n",
+    "\n",
+    "    # End the dialogue with one EOS so an end-of-sequence target remains\n",
+    "    # after padding is excluded from the loss below.\n",
+    "    dialogue_text += tokenizer.eos_token\n",
+    "\n",
+    "    tokenized = tokenizer(\n",
+    "        dialogue_text,\n",
+    "        padding=\"max_length\",\n",
+    "        truncation=True,\n",
+    "        max_length=max_seq_length - GRAPH_LENGTH,  # leave room for the graph embedding\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "\n",
+    "    input_ids = tokenized[\"input_ids\"].squeeze(0)\n",
+    "    attention_mask = tokenized[\"attention_mask\"].squeeze(0)\n",
+    "\n",
+    "    # -100 is the label value the causal-LM loss ignores; without it every padding\n",
+    "    # position (pad == eos) would be trained on as a target.\n",
+    "    labels = input_ids.clone()\n",
+    "    labels[attention_mask == 0] = -100\n",
+    "\n",
+    "    train_data.append({\n",
+    "        \"input_ids\": input_ids,\n",
+    "        \"attention_mask\": attention_mask,\n",
+    "        \"labels\": labels,\n",
+    "        \"graph_embedding\": graph_embedding,\n",
+    "    })\n",
+    "\n",
+    "print(\"🚀 处理后数据条数:\", len(train_data))\n",
+    "# Show tensor shapes only; printing a full record dumps hundreds of floats into the output.\n",
+    "if train_data:\n",
+    "    print(\"✅ 示例数据:\", {key: tuple(value.shape) for key, value in train_data[0].items()})\n",
+    "torch.save(train_data, \"train_data.pt\")\n",
+    "print(\"✅ train_data 已保存到 train_data.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "05a48aa8-c597-4ff1-9569-aa210f4f1f5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM that reads one extra graph token placed in front of the text tokens.\n",
+    "\n",
+    "    The pretrained language model is held in `self.model`; `self.graph_proj` maps a\n",
+    "    512-d graph embedding to the model's hidden size.\n",
+    "\n",
+    "    This is a plain `nn.Module` wrapper on purpose. `AutoModelForCausalLM` is a factory:\n",
+    "    it cannot be instantiated directly and its `from_pretrained` returns the concrete\n",
+    "    architecture class, so methods added in a subclass never reach the loaded object.\n",
+    "\n",
+    "    NOTE(review): `graph_proj` is newly initialised on every load; trained projection\n",
+    "    weights must be restored separately (for example with `load_state_dict`).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Load the pretrained LM and create the graph projection layer.\n",
+    "\n",
+    "        Extra positional and keyword arguments are forwarded to\n",
+    "        `AutoModelForCausalLM.from_pretrained`.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)\n",
+    "        self.config = self.model.config\n",
+    "        # Linear map from the 512-d graph embedding to hidden_size.\n",
+    "        self.graph_proj = nn.Linear(512, self.config.hidden_size)\n",
+    "\n",
+    "    def _prepend_graph_token(self, input_ids, attention_mask, graph_embedding):\n",
+    "        \"\"\"Return (inputs_embeds, attention_mask) with the projected graph token at position 0.\"\"\"\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden)\n",
+    "\n",
+    "        # Accept a single un-batched vector of shape (512,) as a batch of one.\n",
+    "        if graph_embedding.dim() == 1:\n",
+    "            graph_embedding = graph_embedding.unsqueeze(0)\n",
+    "\n",
+    "        graph_token = self.graph_proj(graph_embedding.to(self.graph_proj.weight.dtype))  # (batch, hidden)\n",
+    "        # Match the embedding dtype so the concat also works for half-precision models.\n",
+    "        graph_token = graph_token.unsqueeze(1).to(inputs_embeds.dtype)  # (batch, 1, hidden)\n",
+    "        inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)  # (batch, seq_len+1, hidden)\n",
+    "\n",
+    "        if attention_mask is not None:\n",
+    "            # The graph token is always attended to.\n",
+    "            graph_mask = torch.ones((attention_mask.shape[0], 1), device=attention_mask.device, dtype=attention_mask.dtype)\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)  # (batch, seq_len+1)\n",
+    "\n",
+    "        return inputs_embeds, attention_mask\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the LM on the text tokens preceded by the projected graph token.\n",
+    "\n",
+    "        `input_ids`: (batch_size, seq_len)\n",
+    "        `graph_embedding`: (batch_size, 512) or (512,)\n",
+    "        `labels`: optional, (batch_size, seq_len); aligned with `input_ids`\n",
+    "\n",
+    "        Returns the output object of the wrapped model; its sequence axis has length seq_len+1.\n",
+    "        Raises ValueError when `graph_embedding` is missing.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            raise ValueError(\"graph_embedding is required\")\n",
+    "\n",
+    "        inputs_embeds, attention_mask = self._prepend_graph_token(input_ids, attention_mask, graph_embedding)\n",
+    "\n",
+    "        if labels is not None:\n",
+    "            # The sequence grew by one position; give the graph slot the ignored label -100\n",
+    "            # so labels and logits have the same length.\n",
+    "            graph_label = torch.full((labels.shape[0], 1), -100, dtype=labels.dtype, device=labels.device)\n",
+    "            labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        outputs = self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "        return outputs\n",
+    "\n",
+    "    def generate_with_graph(self, inputs, graph_embedding, max_length=500, temperature=0.7, top_k=50, top_p=0.9):\n",
+    "        \"\"\"Generate text conditioned on a tokenized prompt and a graph embedding.\n",
+    "\n",
+    "        `inputs`: tokenizer output holding `input_ids` and optionally `attention_mask`\n",
+    "        `graph_embedding`: (1, 512) or (512,)\n",
+    "        `max_length`, `temperature`, `top_k`, `top_p`: forwarded to `generate()`\n",
+    "\n",
+    "        Returns the decoded string. Decoding uses the notebook-level `tokenizer`.\n",
+    "        \"\"\"\n",
+    "        attention_mask = inputs[\"attention_mask\"] if \"attention_mask\" in inputs else None\n",
+    "        inputs_embeds, attention_mask = self._prepend_graph_token(inputs[\"input_ids\"], attention_mask, graph_embedding)\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            output_ids = self.model.generate(\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                attention_mask=attention_mask,\n",
+    "                max_length=max_length,\n",
+    "                do_sample=True,  # temperature / top_k / top_p only take effect when sampling\n",
+    "                temperature=temperature,\n",
+    "                top_k=top_k,\n",
+    "                top_p=top_p,\n",
+    "                num_return_sequences=1\n",
+    "            )\n",
+    "\n",
+    "        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
+    "        return generated_text\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Build a GraphAwareLM around the checkpoint at `pretrained_model_name_or_path`.\"\"\"\n",
+    "        return cls(pretrained_model_name_or_path, *model_args, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "73ae15d9-c9d9-4e64-ac8b-2d5877eac984",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "21c8df04-0dc2-436c-aaaf-74a885f734d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7ad289c5523340f39799ad11e3bc1bb5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Qwen2ForCausalLM(\n",
+       "  (model): Qwen2Model(\n",
+       "    (embed_tokens): Embedding(151936, 1536)\n",
+       "    (layers): ModuleList(\n",
+       "      (0-27): 28 x Qwen2DecoderLayer(\n",
+       "        (self_attn): Qwen2Attention(\n",
+       "          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
+       "        )\n",
+       "        (mlp): Qwen2MLP(\n",
+       "          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
+       "          (act_fn): SiLU()\n",
+       "        )\n",
+       "        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "      )\n",
+       "    )\n",
+       "    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "    (rotary_emb): Qwen2RotaryEmbedding()\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
+       "  (graph_proj): Linear(in_features=512, out_features=1536, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Load the tokenizer of the base model.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # same padding convention as the data-preparation cell\n",
+    "\n",
+    "# Load the fine-tuned model.\n",
+    "# NOTE(review): this cell used to assign model_path = \"/workspace/model2\" and then load the\n",
+    "# checkpoint literal below without using the variable; confirm which path is intended.\n",
+    "model_path = \"/workspace/results2/checkpoint-5310\"\n",
+    "model = GraphAwareLM.from_pretrained(model_path).to(device)\n",
+    "model.eval()  # inference mode\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "51995891-8906-4049-9401-2d22e06a84e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[-0.0380, -0.0350, -0.0423,  ...,  0.0213,  0.0148, -0.0047],\n",
+      "        [ 0.0131,  0.0388, -0.0378,  ...,  0.0399, -0.0309, -0.0342],\n",
+      "        [ 0.0084, -0.0116,  0.0259,  ...,  0.0344,  0.0268, -0.0062],\n",
+      "        ...,\n",
+      "        [ 0.0080, -0.0073, -0.0023,  ..., -0.0120,  0.0387,  0.0209],\n",
+      "        [ 0.0277,  0.0326,  0.0270,  ...,  0.0124, -0.0348,  0.0389],\n",
+      "        [ 0.0184, -0.0410, -0.0415,  ...,  0.0255, -0.0429, -0.0386]],\n",
+      "       device='cuda:0', requires_grad=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(model.graph_proj.weight)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7a8562c0-8d55-4412-8f89-de20bae0f7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "test_data = data[0]\n",
+    "\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\")\n",
+    "\n",
+    "# GraphAwareLM.forward documents graph_embedding as (batch_size, 512); keep an explicit batch axis.\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32).reshape(1, -1).to(device)\n",
+    "\n",
+    "question1 = conversations[4][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "\n",
+    "# Role markers used when the training text was assembled.\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",\n",
+    "    \"gpt\": \"<|Assistant|>\",\n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH\n",
+    "\n",
+    "# Wrap the question in the \"<role token> content\" line format of the data-preparation cell\n",
+    "# and open the assistant turn, so the prompt looks like the text the model was trained on.\n",
+    "prompt = f\"{ROLE_TOKENS['human']} {question1}\n{ROLE_TOKENS['gpt']}\"\n",
+    "inputs = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=max_seq_length - GRAPH_LENGTH).to(device)\n",
+    "\n",
+    "input_ids = inputs[\"input_ids\"]\n",
+    "attention_mask = inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "4bd7493f-ca8d-4c28-914d-95b1c30f8fcc",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Qwen2ForCausalLM' object has no attribute 'generate_with_graph'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m generated_text \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_with_graph\u001b[49m(inputs, graph_embedding)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1695\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   1693\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m   1694\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1695\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'Qwen2ForCausalLM' object has no attribute 'generate_with_graph'"
+     ]
+    }
+   ],
+   "source": [
+    "generated_text = model.generate_with_graph(inputs, graph_embedding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62f40327-f102-4259-80a5-8761d5d7d3c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph_embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "067a0cf7-3010-4b6b-b2aa-d4ce95010d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "模型回复：  How\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One forward pass; the model expects graph_embedding as (batch_size, 512).\n",
+    "graph_embedding_batch = graph_embedding if graph_embedding.dim() == 2 else graph_embedding.reshape(1, -1)\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(input_ids=input_ids, attention_mask=attention_mask, graph_embedding=graph_embedding_batch)\n",
+    "\n",
+    "# Greedy choice from the logits of the last position.\n",
+    "logits = outputs.logits[:, -1, :]\n",
+    "predicted_id = torch.argmax(logits, dim=-1)\n",
+    "\n",
+    "# Decode the chosen token id back to text.\n",
+    "response_text = tokenizer.decode(predicted_id, skip_special_tokens=True)\n",
+    "\n",
+    "print(\"模型回复：\", response_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "ae38ed68-bc6a-4bc3-aee8-d54d2dd689ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: Is there any sequential logic in the module, and if so, how is it handled? `data` is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the 
output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit data, and the output is the output of the `data` is a 1-bit\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Greedy decoding, one token per step. Every step re-runs the model on the whole\n",
+    "# sequence generated so far (no key/value cache is passed).\n",
+    "max_new_tokens = 1024\n",
+    "# The model expects graph_embedding as (batch_size, 512).\n",
+    "graph_embedding_batch = graph_embedding if graph_embedding.dim() == 2 else graph_embedding.reshape(1, -1)\n",
+    "generated_ids = input_ids.clone()\n",
+    "generated_attention_mask = attention_mask.clone()\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    for _ in range(max_new_tokens):\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,                  # (batch_size, seq_len)\n",
+    "            attention_mask=generated_attention_mask,  # (batch_size, seq_len)\n",
+    "            graph_embedding=graph_embedding_batch,    # (batch_size, 512)\n",
+    "        )\n",
+    "\n",
+    "        logits = outputs.logits[:, -1, :]  # logits of the last position\n",
+    "        next_token = torch.argmax(logits, dim=-1)  # greedy choice\n",
+    "\n",
+    "        # Append the chosen token to the running sequence.\n",
+    "        generated_ids = torch.cat([generated_ids, next_token.unsqueeze(1)], dim=1)\n",
+    "\n",
+    "        # .item() assumes a batch of exactly one sequence.\n",
+    "        if next_token.item() == tokenizer.eos_token_id:\n",
+    "            break\n",
+    "\n",
+    "        # Grow the mask by one attended position for the next step.\n",
+    "        generated_attention_mask = torch.cat(\n",
+    "            [generated_attention_mask, torch.ones((1, 1), dtype=generated_attention_mask.dtype, device=generated_attention_mask.device)], dim=1\n",
+    "        )\n",
+    "\n",
+    "# Decode prompt plus continuation.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "803f41fe-f504-4c2a-96b4-afc2cd437d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[151646,   3838,    525,    279,   8286,  17473,    304,    279,   6250,\n",
+       "          50773,   2038,    369,    279,  29952,   4688,     11,    323,   1128,\n",
+       "            525,    862,   9895,     30]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87d1396b-4d20-4a76-a092-b26a587a76ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

.ipynb_checkpoints/graph_train3-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,1588 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fa17529d-eaa7-473e-9d2d-cc05a0120a51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\" \n",
+    "GRAPH_LENGTH = 512\n",
+    "HF_NAME = \"KSU-HW-SEC/r1q1.5_graph_lora_new3\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bba6e6db-4b79-4461-ba13-75fd41019358",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CUDA 可用: True\n",
+      "GPU 数量: 1\n",
+      "当前 GPU: 0\n",
+      "GPU 名称: NVIDIA A100 80GB PCIe\n"
+     ]
+    }
+   ],
+   "source": [
+    "# !pip install transformers accelerate datasets\n",
+    "# !pip install galore-torch\n",
+    "# !pip install huggingface_hub\n",
+    "import torch\n",
+    "\n",
+    "cuda_available = torch.cuda.is_available()\n",
+    "print(\"CUDA 可用:\", cuda_available)\n",
+    "print(\"GPU 数量:\", torch.cuda.device_count())\n",
+    "# current_device() and get_device_name() raise when no CUDA device is present,\n",
+    "# so query them only when a GPU is available.\n",
+    "if cuda_available:\n",
+    "    print(\"当前 GPU:\", torch.cuda.current_device())\n",
+    "    print(\"GPU 名称:\", torch.cuda.get_device_name(torch.cuda.current_device()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ef5551ca-89e2-4488-8e68-1c8d964de039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_seq_length = 1100 + GRAPH_LENGTH  # 最大序列长度"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8e283f49-fde4-46e2-9891-dbc304058f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:48: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_134403-e0v0giuw</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/e0v0giuw' target=\"_blank\">experi030403</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/e0v0giuw' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/e0v0giuw</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5310' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5310/5310 1:33:59, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>5.319300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>3.641300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>1.521800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>1.027500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>0.922400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>0.866900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>0.800500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>0.721600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>0.740400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>0.737000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>550</td>\n",
+       "      <td>0.713500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>600</td>\n",
+       "      <td>0.747000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>650</td>\n",
+       "      <td>0.869500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>700</td>\n",
+       "      <td>1.473300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>750</td>\n",
+       "      <td>0.753000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>800</td>\n",
+       "      <td>0.741300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>850</td>\n",
+       "      <td>0.751400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>900</td>\n",
+       "      <td>0.787600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>950</td>\n",
+       "      <td>0.783200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1000</td>\n",
+       "      <td>0.780200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1050</td>\n",
+       "      <td>1.012900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1100</td>\n",
+       "      <td>1.411700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1150</td>\n",
+       "      <td>1.536400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1200</td>\n",
+       "      <td>0.853800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1250</td>\n",
+       "      <td>0.756500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1300</td>\n",
+       "      <td>0.750800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1350</td>\n",
+       "      <td>0.747400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1400</td>\n",
+       "      <td>0.844400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1450</td>\n",
+       "      <td>0.858400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1500</td>\n",
+       "      <td>1.053400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1550</td>\n",
+       "      <td>1.591600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1600</td>\n",
+       "      <td>1.498900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1650</td>\n",
+       "      <td>1.471700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1700</td>\n",
+       "      <td>1.221100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1750</td>\n",
+       "      <td>1.802300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1800</td>\n",
+       "      <td>1.826000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1850</td>\n",
+       "      <td>1.857300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1900</td>\n",
+       "      <td>1.561800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1950</td>\n",
+       "      <td>1.398800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>1.398900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2050</td>\n",
+       "      <td>1.381600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2100</td>\n",
+       "      <td>0.890300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2150</td>\n",
+       "      <td>0.763700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2200</td>\n",
+       "      <td>0.753100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2250</td>\n",
+       "      <td>0.745500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2300</td>\n",
+       "      <td>1.186100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2350</td>\n",
+       "      <td>0.862000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2400</td>\n",
+       "      <td>1.024600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2450</td>\n",
+       "      <td>1.028400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2500</td>\n",
+       "      <td>1.008500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2550</td>\n",
+       "      <td>0.942800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2600</td>\n",
+       "      <td>0.849700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2650</td>\n",
+       "      <td>0.771400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2700</td>\n",
+       "      <td>0.794100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2750</td>\n",
+       "      <td>0.819200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2800</td>\n",
+       "      <td>0.937500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2850</td>\n",
+       "      <td>1.064500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2900</td>\n",
+       "      <td>1.189300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2950</td>\n",
+       "      <td>1.071100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3000</td>\n",
+       "      <td>1.003300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3050</td>\n",
+       "      <td>1.073900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3100</td>\n",
+       "      <td>1.043100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3150</td>\n",
+       "      <td>1.282600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3200</td>\n",
+       "      <td>2.145400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3250</td>\n",
+       "      <td>1.925800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3300</td>\n",
+       "      <td>2.005600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3350</td>\n",
+       "      <td>2.122600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3400</td>\n",
+       "      <td>2.163000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3450</td>\n",
+       "      <td>2.046600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3500</td>\n",
+       "      <td>2.152200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3550</td>\n",
+       "      <td>2.151700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3600</td>\n",
+       "      <td>5.394900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3650</td>\n",
+       "      <td>4.677800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3700</td>\n",
+       "      <td>4.122200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3750</td>\n",
+       "      <td>3.710200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3800</td>\n",
+       "      <td>3.350800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3850</td>\n",
+       "      <td>3.126300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3900</td>\n",
+       "      <td>2.988700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3950</td>\n",
+       "      <td>2.872000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>2.848200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4050</td>\n",
+       "      <td>2.823900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4100</td>\n",
+       "      <td>2.781200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4150</td>\n",
+       "      <td>2.735000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4200</td>\n",
+       "      <td>2.725900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4250</td>\n",
+       "      <td>2.644400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4300</td>\n",
+       "      <td>2.700000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4350</td>\n",
+       "      <td>2.650100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4400</td>\n",
+       "      <td>2.704500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4450</td>\n",
+       "      <td>2.596700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4500</td>\n",
+       "      <td>2.510500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4550</td>\n",
+       "      <td>2.515800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4600</td>\n",
+       "      <td>2.498100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4650</td>\n",
+       "      <td>2.458900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4700</td>\n",
+       "      <td>2.449700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4750</td>\n",
+       "      <td>2.425000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4800</td>\n",
+       "      <td>2.362300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4850</td>\n",
+       "      <td>2.232000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4900</td>\n",
+       "      <td>2.361500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4950</td>\n",
+       "      <td>2.302300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5000</td>\n",
+       "      <td>2.333900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5050</td>\n",
+       "      <td>2.367200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5100</td>\n",
+       "      <td>2.288300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5150</td>\n",
+       "      <td>2.426100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5200</td>\n",
+       "      <td>2.344100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5250</td>\n",
+       "      <td>2.283500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5300</td>\n",
+       "      <td>2.296500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new3/commit/b9472b66316be8654c6f7c173fa4561889bd3446', commit_message='End of training', commit_description='', oid='b9472b66316be8654c6f7c173fa4561889bd3446', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new3', endpoint='https://huggingface.co', repo_type='model', repo_id='KSU-HW-SEC/r1q1.5_graph_lora_new3'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Load the preprocessed training samples exactly once.\n",
+    "# NOTE: `weights_only=False` unpickles arbitrary objects, so only load a\n",
+    "# `train_data.pt` that was produced by a trusted source.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):  # fail loudly instead of calling exit() in a kernel\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Fall back to the default base model when MODEL_NAME has not been defined yet.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"  # default model\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    \"\"\"Map-style dataset over a sequence of pre-built training samples.\n",
+    "\n",
+    "    Every sample is indexed by the keys `input_ids`, `attention_mask`,\n",
+    "    `graph_embedding` and `labels`; `__getitem__` returns exactly those four fields.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        record = self.data[idx]\n",
+    "        # `graph_embedding` is the extra, non-text input consumed by the model.\n",
+    "        fields = (\"input_ids\", \"attention_mask\", \"graph_embedding\", \"labels\")\n",
+    "        return {field: record[field] for field in fields}\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM wrapper that prepends graph-derived token(s) to every sequence.\n",
+    "\n",
+    "    The previous version subclassed `AutoModelForCausalLM`, so the inherited\n",
+    "    `GraphAwareLM.from_pretrained(...)` factory returned the plain base model and\n",
+    "    this class's `__init__` / `forward` never ran (the graph embedding was never\n",
+    "    used). This version is a regular `nn.Module` that owns the base model and\n",
+    "    provides its own `from_pretrained`, so existing call sites keep working.\n",
+    "\n",
+    "    NOTE(review): as a plain `nn.Module` this is presumably checkpointed by\n",
+    "    `Trainer` as a state dict rather than via `save_pretrained` - confirm that\n",
+    "    downstream loading code expects that.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, num_heads=8, graph_dim=512):\n",
+    "        \"\"\"Load the base LM and build the graph-embedding adapter.\n",
+    "\n",
+    "        Args:\n",
+    "            pretrained_model_name_or_path: checkpoint name or path of the base LM.\n",
+    "            num_heads: attention heads of the adapter; must divide `hidden_size`.\n",
+    "            graph_dim: size of the incoming graph embedding (512 by default).\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "\n",
+    "        # Load the pretrained LLM once and expose its config on the wrapper.\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)\n",
+    "        self.config = self.model.config\n",
+    "        hidden_size = self.config.hidden_size\n",
+    "\n",
+    "        # 1. Project the graph embedding from `graph_dim` to `hidden_size`.\n",
+    "        self.linear1 = nn.Linear(graph_dim, hidden_size)\n",
+    "\n",
+    "        # 2. Multi-head attention over the graph token(s).\n",
+    "        self.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)\n",
+    "\n",
+    "        # 3. Linear layer applied to the attention output.\n",
+    "        self.linear2 = nn.Linear(hidden_size, hidden_size)\n",
+    "\n",
+    "        # 4. LayerNorm applied after the residual connection.\n",
+    "        self.norm = nn.LayerNorm(hidden_size)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, num_heads=8, graph_dim=512):\n",
+    "        \"\"\"Build a `GraphAwareLM` (not a bare base model) from a pretrained checkpoint.\"\"\"\n",
+    "        return cls(pretrained_model_name_or_path, num_heads=num_heads, graph_dim=graph_dim)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the LM with optional graph token(s) prepended to the sequence.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, (batch_size, seq_len).\n",
+    "            attention_mask: optional mask, (batch_size, seq_len).\n",
+    "            labels: optional targets, (batch_size, seq_len).\n",
+    "            graph_embedding: optional (batch_size, graph_dim) or\n",
+    "                (batch_size, n_graph_tokens, graph_dim) tensor.\n",
+    "\n",
+    "        Returns:\n",
+    "            The base model's output object (carries `.loss` when `labels` is given).\n",
+    "        \"\"\"\n",
+    "        # Without a graph embedding, behave exactly like the base LM.\n",
+    "        if graph_embedding is None:\n",
+    "            return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "\n",
+    "        # Token embeddings: (batch_size, seq_len, hidden_size)\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n",
+    "\n",
+    "        # 1. Project, then add the sequence axis BEFORE attention. A 2-D input would be\n",
+    "        #    treated by nn.MultiheadAttention as one unbatched sequence, mixing samples.\n",
+    "        graph_tokens = self.linear1(graph_embedding)\n",
+    "        if graph_tokens.dim() == 2:\n",
+    "            graph_tokens = graph_tokens.unsqueeze(1)  # (batch_size, 1, hidden_size)\n",
+    "\n",
+    "        # 2. Self-attention over the graph token(s).\n",
+    "        attn_output, _ = self.multihead_attn(graph_tokens, graph_tokens, graph_tokens)\n",
+    "\n",
+    "        # 3./4. Linear layer + residual connection, then LayerNorm.\n",
+    "        graph_tokens = self.norm(self.linear2(attn_output) + graph_tokens)\n",
+    "        n_graph_tokens = graph_tokens.shape[1]\n",
+    "\n",
+    "        # Prepend the graph token(s): (batch_size, n_graph_tokens + seq_len, hidden_size)\n",
+    "        inputs_embeds = torch.cat([graph_tokens.to(inputs_embeds.dtype), inputs_embeds], dim=1)\n",
+    "\n",
+    "        # Extend the attention mask so the graph token(s) are always attended to.\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = attention_mask.new_ones((attention_mask.shape[0], n_graph_tokens))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        # Keep labels aligned with the longer sequence; -100 is ignored by the loss.\n",
+    "        if labels is not None:\n",
+    "            graph_labels = labels.new_full((labels.shape[0], n_graph_tokens), -100)\n",
+    "            labels = torch.cat([graph_labels, labels], dim=1)\n",
+    "\n",
+    "        # Run the base model on the combined embeddings.\n",
+    "        return self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer that forwards the optional `graph_embedding` batch field to the model.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        \"\"\"Call the model on one batch and return its loss.\n",
+    "\n",
+    "        `graph_embedding` is passed to the model only when the batch provides a\n",
+    "        non-None value. Returns `(loss, outputs)` when `return_outputs` is true,\n",
+    "        otherwise just `loss`.\n",
+    "        \"\"\"\n",
+    "        forward_args = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "        graph_vec = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_vec is not None:\n",
+    "            forward_args[\"graph_embedding\"] = graph_vec\n",
+    "\n",
+    "        result = model(**forward_args)\n",
+    "        batch_loss = result.loss\n",
+    "        if return_outputs:\n",
+    "            return (batch_loss, result)\n",
+    "        return batch_loss\n",
+    "\n",
+    "\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# ✅ 载入微调模型\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# ✅ 训练参数\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results3\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    bf16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # ✅ 让 GaLore 作用于所有线性层\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # ✅ 低秩分解参数\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi030403\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# ✅ 转换 `train_data` 为 `Dataset`\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# ✅ 训练\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.save_model(\"/workspace/model3\")\n",
+    "trainer.push_to_hub()\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7a72ac3b-561e-41d3-ae93-99f20acf3188",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora_new2-3000', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora_new2-3000')"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "\n",
+    "api = HfApi()\n",
+    "repo_name = \"r1q1.5_graph_lora-results3\"  # 你的模型名称\n",
+    "api.create_repo(repo_name, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "73c434b9-5d58-4819-8526-24aa18ca1010",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8b896f21685e4086b0b59404b2b1a866",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d20bff067ca44c4583378181da817897",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c4b7114a53b341539a3244f2eea8aacf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "74c6045017b640bdba86fe3ed1bb9c92",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "97436b084bc4420f8b273ec462c50e61",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "optimizer.pt:   0%|          | 0.00/4.32G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d7f10ccff3674e6fa8bcb42553c12b19",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c5b1a010fd0845f9ba9112291afa8f17",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/YiFzhao/r1q1.5_graph_lora_new2-3000/commit/4088de651a0ce2cc39fcb0c950898e54ce91bdea', commit_message='upload checkpoint-3000', commit_description='', oid='4088de651a0ce2cc39fcb0c950898e54ce91bdea', pr_url=None, repo_url=RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora_new2-3000', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora_new2-3000'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import upload_folder\n",
+    "\n",
+    "# Upload the local results3 folder to the matching Hub repository. The commit\n",
+    "# message names the folder that is actually uploaded (it previously said results2).\n",
+    "upload_folder(\n",
+    "    folder_path = \"/workspace/results3\",\n",
+    "    repo_id = \"YiFzhao/r1q1.5_graph_lora-results3\",\n",
+    "    commit_message = \"upload results3\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8d2ebf87-402e-444d-8599-96c313f1b7fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 处理后数据条数: 12384\n",
+      "✅ 示例数据: {'input_ids': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]), 'labels': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'graph_embedding': tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+      "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+      "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+      "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+      "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+      "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+      "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+      "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+      "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+      "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+      "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+      "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+      "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+      "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+      "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+      "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+      "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+      "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+      "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+      "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+      "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+      "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+      "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+      "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+      "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+      "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+      "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+      "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+      "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+      "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+      "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+      "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+      "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+      "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+      "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+      "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+      "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+      "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+      "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+      "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+      "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+      "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+      "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+      "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+      "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+      "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+      "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+      "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+      "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+      "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+      "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+      "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+      "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+      "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+      "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+      "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+      "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+      "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+      "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+      "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+      "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+      "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+      "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+      "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635])}\n",
+      "✅ train_data 已保存到 train_data.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Build the supervised fine-tuning set: one padded token sequence plus one graph embedding per sample.\n",
+    "# ROLE_TOKENS, max_seq_length and GRAPH_LENGTH are not defined in this cell and must already exist in the kernel.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # EOS doubles as the padding token\n",
+    "\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "train_data = []\n",
+    "\n",
+    "for sample in data:\n",
+    "    conversations = sample.get(\"conversations\", [])\n",
+    "    embeddings = sample.get(\"embedding\", [])\n",
+    "\n",
+    "    # Skip samples that carry no usable graph embedding.\n",
+    "    if not isinstance(embeddings, list) or len(embeddings) == 0:\n",
+    "        print(f\"无效的 embedding，跳过样本：{sample}\")\n",
+    "        continue\n",
+    "\n",
+    "    # squeeze(0) removes a leading singleton axis if present (the original note expects shape [512]).\n",
+    "    graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0)\n",
+    "\n",
+    "    # Concatenate all turns, one per line, each prefixed by its role token; <image> placeholders are removed.\n",
+    "    dialogue_text = \"\"\n",
+    "    for conv in conversations:\n",
+    "        role = conv[\"from\"]  # \"human\" or \"gpt\"\n",
+    "        content = conv[\"value\"].replace(\"<image>\", \"\")\n",
+    "        role_token = ROLE_TOKENS.get(role, f\"<|{role}|>\")  # generic tag for roles missing from the table\n",
+    "        dialogue_text += f\"{role_token} {content}\\n\"\n",
+    "\n",
+    "    tokenized = tokenizer(\n",
+    "        dialogue_text,\n",
+    "        padding=\"max_length\",\n",
+    "        truncation=True,\n",
+    "        max_length=max_seq_length - GRAPH_LENGTH,  # leave room for the graph embedding\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "\n",
+    "    input_ids = tokenized[\"input_ids\"].squeeze(0)\n",
+    "    attention_mask = tokenized[\"attention_mask\"].squeeze(0)\n",
+    "\n",
+    "    # Padding reuses the EOS id, so copying input_ids verbatim would train the model on every\n",
+    "    # padding position. -100 is the index the causal-LM loss ignores.\n",
+    "    labels = input_ids.clone()\n",
+    "    labels[attention_mask == 0] = -100\n",
+    "\n",
+    "    train_data.append({\n",
+    "        \"input_ids\": input_ids,\n",
+    "        \"attention_mask\": attention_mask,\n",
+    "        \"labels\": labels,\n",
+    "        \"graph_embedding\": graph_embedding,\n",
+    "    })\n",
+    "\n",
+    "print(\"🚀 处理后数据条数:\", len(train_data))\n",
+    "if train_data:  # every sample may have been skipped above\n",
+    "    print(\"✅ 示例数据:\", train_data[0])\n",
+    "torch.save(train_data, \"train_data.pt\")\n",
+    "print(\"✅ train_data 已保存到 train_data.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "05a48aa8-c597-4ff1-9569-aa210f4f1f5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(AutoModelForCausalLM):\n",
+    "    \"\"\"Causal LM conditioned on a graph embedding through a learned prefix token.\n",
+    "\n",
+    "    Build instances with `GraphAwareLM.from_pretrained(...)`. The returned object is the\n",
+    "    concrete Hugging Face causal LM loaded for the checkpoint, extended with four modules\n",
+    "    (`linear1`, `multihead_attn`, `linear2`, `norm`) and re-typed so that the `forward`\n",
+    "    and `generate` defined here take precedence over the stock implementations.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    GRAPH_DIM = 512  # width of the incoming graph embedding\n",
+    "    GRAPH_MODULES = (\"linear1\", \"multihead_attn\", \"linear2\", \"norm\")\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, num_heads=8):\n",
+    "        # Auto classes refuse direct construction, so fail fast with an actionable\n",
+    "        # message instead of loading a checkpoint first and then raising anyway.\n",
+    "        raise EnvironmentError(\n",
+    "            \"GraphAwareLM cannot be instantiated directly; use \"\n",
+    "            \"GraphAwareLM.from_pretrained(pretrained_model_name_or_path, num_heads=...).\"\n",
+    "        )\n",
+    "\n",
+    "    def _graph_prefix(self, graph_embedding):\n",
+    "        \"\"\"Map graph embedding(s) to prefix embeddings of shape (batch, tokens, hidden_size).\n",
+    "\n",
+    "        Accepts one vector (GRAPH_DIM,), a batch (batch, GRAPH_DIM) or an already expanded\n",
+    "        (batch, tokens, GRAPH_DIM) tensor; `tokens` is 1 for the first two forms.\n",
+    "        \"\"\"\n",
+    "        weight = self.linear1.weight\n",
+    "        graph_embedding = graph_embedding.to(device=weight.device, dtype=weight.dtype)\n",
+    "        if graph_embedding.dim() == 1:\n",
+    "            graph_embedding = graph_embedding.unsqueeze(0)\n",
+    "        if graph_embedding.dim() == 2:\n",
+    "            # Add an explicit token axis: a 2-D input would be read by MultiheadAttention as a\n",
+    "            # single unbatched sequence, i.e. attention across the samples of the batch.\n",
+    "            graph_embedding = graph_embedding.unsqueeze(1)\n",
+    "        token = self.linear1(graph_embedding)\n",
+    "        attn_output, _ = self.multihead_attn(token, token, token, need_weights=False)\n",
+    "        return self.norm(self.linear2(attn_output) + token)  # residual connection + LayerNorm\n",
+    "\n",
+    "    def _prepend_graph(self, graph_embedding, inputs_embeds, attention_mask):\n",
+    "        \"\"\"Put the graph prefix in front of `inputs_embeds` and extend `attention_mask` to match.\n",
+    "\n",
+    "        Returns (inputs_embeds, attention_mask, number_of_prefix_positions).\n",
+    "        \"\"\"\n",
+    "        prefix = self._graph_prefix(graph_embedding).to(inputs_embeds.dtype)\n",
+    "        batch_size = inputs_embeds.shape[0]\n",
+    "        if prefix.shape[0] == 1 and batch_size > 1:\n",
+    "            prefix = prefix.expand(batch_size, -1, -1)  # one graph shared by the whole batch\n",
+    "        inputs_embeds = torch.cat([prefix, inputs_embeds], dim=1)\n",
+    "        if attention_mask is not None:\n",
+    "            prefix_mask = attention_mask.new_ones((batch_size, prefix.shape[1]))\n",
+    "            attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)\n",
+    "        return inputs_embeds, attention_mask, prefix.shape[1]\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None,\n",
+    "                inputs_embeds=None, **kwargs):\n",
+    "        \"\"\"Run the LM, optionally prefixed with the projected graph embedding.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch, seq_len) token ids.\n",
+    "            attention_mask: (batch, seq_len) mask, extended here to cover the prefix.\n",
+    "            labels: (batch, seq_len) targets; prefix positions are filled with -100 so the\n",
+    "                loss ignores them and labels stay aligned with the longer input.\n",
+    "            graph_embedding: (GRAPH_DIM,) or (batch, GRAPH_DIM). When None the call is\n",
+    "                forwarded unchanged, which is also the path used internally by `generate`.\n",
+    "            inputs_embeds / **kwargs: passed through to the underlying model.\n",
+    "\n",
+    "        Returns:\n",
+    "            The underlying model's output; logits have seq_len + 1 positions when a\n",
+    "            graph embedding is given.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            return super().forward(\n",
+    "                input_ids=input_ids,\n",
+    "                attention_mask=attention_mask,\n",
+    "                labels=labels,\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                **kwargs,\n",
+    "            )\n",
+    "\n",
+    "        if inputs_embeds is None:\n",
+    "            inputs_embeds = self.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden_size)\n",
+    "        inputs_embeds, attention_mask, n_prefix = self._prepend_graph(\n",
+    "            graph_embedding, inputs_embeds, attention_mask\n",
+    "        )\n",
+    "        if labels is not None:\n",
+    "            ignore = labels.new_full((labels.shape[0], n_prefix), -100)\n",
+    "            labels = torch.cat([ignore, labels], dim=1)\n",
+    "\n",
+    "        return super().forward(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "            **kwargs,\n",
+    "        )\n",
+    "\n",
+    "    def generate(self, inputs, graph_embedding, max_length=500, temperature=0.7, top_k=50, top_p=0.9,\n",
+    "                 tokenizer=None):\n",
+    "        \"\"\"Generate text for a tokenized prompt, conditioned on `graph_embedding`.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: mapping with \"input_ids\" and optionally \"attention_mask\" (tokenizer output).\n",
+    "            graph_embedding: (GRAPH_DIM,) or (1, GRAPH_DIM) tensor.\n",
+    "            max_length, temperature, top_k, top_p: forwarded to the underlying `generate`.\n",
+    "            tokenizer: used to decode the result; defaults to `self.tokenizer` when that\n",
+    "                attribute has been set (e.g. via `from_pretrained(..., tokenizer=tok)`).\n",
+    "\n",
+    "        Returns:\n",
+    "            The decoded text of the first returned sequence.\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: if no tokenizer is available for decoding.\n",
+    "        \"\"\"\n",
+    "        if tokenizer is None:\n",
+    "            tokenizer = getattr(self, \"tokenizer\", None)\n",
+    "        if tokenizer is None:\n",
+    "            raise ValueError(\n",
+    "                \"GraphAwareLM.generate needs a tokenizer to decode its output: pass tokenizer=... \"\n",
+    "                \"or set model.tokenizer first.\"\n",
+    "            )\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            inputs_embeds = self.get_input_embeddings()(inputs[\"input_ids\"])  # (1, seq_len, hidden_size)\n",
+    "            attention_mask = inputs[\"attention_mask\"] if \"attention_mask\" in inputs else None\n",
+    "            inputs_embeds, attention_mask, _ = self._prepend_graph(\n",
+    "                graph_embedding, inputs_embeds, attention_mask\n",
+    "            )\n",
+    "            # The stock generate loop calls forward() without graph_embedding, so the prefix\n",
+    "            # is injected exactly once, here, through inputs_embeds.\n",
+    "            output_ids = super().generate(\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                attention_mask=attention_mask,\n",
+    "                max_length=max_length,\n",
+    "                temperature=temperature,\n",
+    "                top_k=top_k,\n",
+    "                top_p=top_p,\n",
+    "                num_return_sequences=1,\n",
+    "            )\n",
+    "\n",
+    "        return tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def _restore_graph_weights(cls, model, checkpoint_dir):\n",
+    "        \"\"\"Copy saved graph-module tensors from a local checkpoint directory into `model`.\n",
+    "\n",
+    "        The stock loader drops keys it does not know, so tensors saved under the\n",
+    "        GRAPH_MODULES names are read back here. Returns True only when every\n",
+    "        graph-module tensor was found and loaded.\n",
+    "        \"\"\"\n",
+    "        import json\n",
+    "        import os\n",
+    "\n",
+    "        try:\n",
+    "            from transformers.modeling_utils import load_state_dict as read_checkpoint_file\n",
+    "        except ImportError:\n",
+    "            return False\n",
+    "        if not os.path.isdir(checkpoint_dir):\n",
+    "            return False\n",
+    "\n",
+    "        prefixes = tuple(f\"{name}.\" for name in cls.GRAPH_MODULES)\n",
+    "        wanted = {key for key in model.state_dict() if key.startswith(prefixes)}\n",
+    "\n",
+    "        # Sharded checkpoints list the file of every tensor in an index; otherwise look for a single file.\n",
+    "        shard_files = []\n",
+    "        for index_name in (\"model.safetensors.index.json\", \"pytorch_model.bin.index.json\"):\n",
+    "            index_path = os.path.join(checkpoint_dir, index_name)\n",
+    "            if os.path.isfile(index_path):\n",
+    "                with open(index_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "                    weight_map = json.load(f)[\"weight_map\"]\n",
+    "                shard_files = sorted({weight_map[key] for key in wanted if key in weight_map})\n",
+    "                break\n",
+    "        else:\n",
+    "            for file_name in (\"model.safetensors\", \"pytorch_model.bin\"):\n",
+    "                if os.path.isfile(os.path.join(checkpoint_dir, file_name)):\n",
+    "                    shard_files = [file_name]\n",
+    "                    break\n",
+    "\n",
+    "        restored = {}\n",
+    "        for file_name in shard_files:\n",
+    "            shard_state = read_checkpoint_file(os.path.join(checkpoint_dir, file_name))\n",
+    "            restored.update({key: value for key, value in shard_state.items() if key in wanted})\n",
+    "        if restored:\n",
+    "            model.load_state_dict(restored, strict=False)\n",
+    "        return bool(wanted) and wanted <= set(restored)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Load a causal LM and turn it into a graph-aware model.\n",
+    "\n",
+    "        Extra keyword arguments consumed here (everything else goes to the underlying loader):\n",
+    "            num_heads: attention heads of the graph module (default 8).\n",
+    "            tokenizer: optional tokenizer stored as `model.tokenizer` for `generate`.\n",
+    "        \"\"\"\n",
+    "        import warnings\n",
+    "\n",
+    "        num_heads = kwargs.pop(\"num_heads\", 8)\n",
+    "        tokenizer = kwargs.pop(\"tokenizer\", None)\n",
+    "\n",
+    "        # 1. Load the language model itself.\n",
+    "        model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)\n",
+    "\n",
+    "        # 2. Attach the graph projector: Linear -> self-attention -> Linear (+ residual) -> LayerNorm.\n",
+    "        hidden_size = model.config.hidden_size\n",
+    "        model.linear1 = nn.Linear(cls.GRAPH_DIM, hidden_size)\n",
+    "        model.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)\n",
+    "        model.linear2 = nn.Linear(hidden_size, hidden_size)\n",
+    "        model.norm = nn.LayerNorm(hidden_size)\n",
+    "        reference = model.get_input_embeddings().weight\n",
+    "        for name in cls.GRAPH_MODULES:\n",
+    "            getattr(model, name).to(device=reference.device, dtype=reference.dtype)\n",
+    "\n",
+    "        # 3. Re-type the instance so the forward/generate defined on this class are the ones\n",
+    "        #    that run; previously the plain loaded model was returned and they were never reached.\n",
+    "        base_cls = type(model)\n",
+    "        model.__class__ = type(f\"GraphAware{base_cls.__name__}\", (cls, base_cls), {})\n",
+    "        if tokenizer is not None:\n",
+    "            model.tokenizer = tokenizer\n",
+    "\n",
+    "        # 4. Restore trained projector weights when the checkpoint has them.\n",
+    "        if not cls._restore_graph_weights(model, pretrained_model_name_or_path):\n",
+    "            warnings.warn(\n",
+    "                f\"No saved weights for {', '.join(cls.GRAPH_MODULES)} were found in \"\n",
+    "                f\"{pretrained_model_name_or_path!r}; these modules are randomly initialized.\"\n",
+    "            )\n",
+    "\n",
+    "        return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "73ae15d9-c9d9-4e64-ac8b-2d5877eac984",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "21c8df04-0dc2-436c-aaaf-74a885f734d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0b50f0cd6d784f598cc64a40cff40f38",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Qwen2ForCausalLM(\n",
+       "  (model): Qwen2Model(\n",
+       "    (embed_tokens): Embedding(151936, 1536)\n",
+       "    (layers): ModuleList(\n",
+       "      (0-27): 28 x Qwen2DecoderLayer(\n",
+       "        (self_attn): Qwen2Attention(\n",
+       "          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
+       "        )\n",
+       "        (mlp): Qwen2MLP(\n",
+       "          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
+       "          (act_fn): SiLU()\n",
+       "        )\n",
+       "        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "      )\n",
+       "    )\n",
+       "    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "    (rotary_emb): Qwen2RotaryEmbedding()\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
+       "  (linear1): Linear(in_features=512, out_features=1536, bias=True)\n",
+       "  (multihead_attn): MultiheadAttention(\n",
+       "    (out_proj): NonDynamicallyQuantizableLinear(in_features=1536, out_features=1536, bias=True)\n",
+       "  )\n",
+       "  (linear2): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "  (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Load the tokenizer from the base model repository.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# Load the fine-tuned checkpoint.\n",
+    "# NOTE(review): `model_path` is assigned here but is not the path that gets loaded.\n",
+    "model_path = \"/workspace/model2\"\n",
+    "checkpoint_path = \"/workspace/results3/checkpoint-3000\"\n",
+    "model = GraphAwareLM.from_pretrained(checkpoint_path)\n",
+    "model = model.to(device)\n",
+    "model.eval()  # inference mode; as the last expression it also displays the module tree\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "51995891-8906-4049-9401-2d22e06a84e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[-0.0380, -0.0350, -0.0423,  ...,  0.0213,  0.0148, -0.0047],\n",
+      "        [ 0.0131,  0.0388, -0.0378,  ...,  0.0399, -0.0309, -0.0342],\n",
+      "        [ 0.0084, -0.0116,  0.0259,  ...,  0.0344,  0.0268, -0.0062],\n",
+      "        ...,\n",
+      "        [ 0.0080, -0.0073, -0.0023,  ..., -0.0120,  0.0387,  0.0209],\n",
+      "        [ 0.0277,  0.0326,  0.0270,  ...,  0.0124, -0.0348,  0.0389],\n",
+      "        [ 0.0184, -0.0410, -0.0415,  ...,  0.0255, -0.0429, -0.0386]],\n",
+      "       device='cuda:0', requires_grad=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The graph projection attached by GraphAwareLM.from_pretrained is named `linear1`;\n",
+    "# the model defines no `graph_proj` attribute.\n",
+    "print(model.linear1.weight)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7a8562c0-8d55-4412-8f89-de20bae0f7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "# Use the first sample of the dataset as the test example.\n",
+    "test_data = data[0]\n",
+    "\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\")\n",
+    "\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0).to(device)\n",
+    "\n",
+    "# First turn of the conversation with the <image> placeholder removed.\n",
+    "question1 = conversations[0][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Role markers and length budget, same values as used when the training text is assembled.\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",\n",
+    "    \"gpt\": \"<|Assistant|>\",\n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH\n",
+    "\n",
+    "# Training text is built as \"<role token> <content>\" with one turn per line, so the prompt\n",
+    "# uses the same layout and ends with the assistant marker for the model to continue from.\n",
+    "prompt = f\"{ROLE_TOKENS['human']} {question1}\\n{ROLE_TOKENS['gpt']}\"\n",
+    "inputs = tokenizer(\n",
+    "    prompt,\n",
+    "    return_tensors=\"pt\",\n",
+    "    truncation=True,\n",
+    "    max_length=max_seq_length - GRAPH_LENGTH,\n",
+    ").to(device)\n",
+    "\n",
+    "input_ids = inputs[\"input_ids\"]\n",
+    "attention_mask = inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "4bd7493f-ca8d-4c28-914d-95b1c30f8fcc",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Tensor' object has no attribute 'update'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m generated_text \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_embedding\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m    113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    114\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1982\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)\u001b[0m\n\u001b[1;32m   1979\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)  \u001b[38;5;66;03m# Pull this out first, we only use it for stopping criteria\u001b[39;00m\n\u001b[1;32m   1980\u001b[0m assistant_tokenizer \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massistant_tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)  \u001b[38;5;66;03m# only used for assisted generation\u001b[39;00m\n\u001b[0;32m-> 1982\u001b[0m generation_config, model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare_generation_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1983\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_model_kwargs(model_kwargs\u001b[38;5;241m.\u001b[39mcopy())\n\u001b[1;32m   1984\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_assistant(assistant_model, tokenizer, assistant_tokenizer)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1549\u001b[0m, in \u001b[0;36mGenerationMixin._prepare_generation_config\u001b[0;34m(self, generation_config, **kwargs)\u001b[0m\n\u001b[1;32m   1547\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torchdynamo_compiling():\n\u001b[1;32m   1548\u001b[0m     generation_config \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(generation_config)\n\u001b[0;32m-> 1549\u001b[0m     model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[43mgeneration_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupdate\u001b[49m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1550\u001b[0m     \u001b[38;5;66;03m# If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model\u001b[39;00m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m using_model_generation_config:\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'Tensor' object has no attribute 'update'"
+     ]
+    }
+   ],
+   "source": [
+    "# NOTE(review): the traceback recorded for this cell shows the call landing in transformers'\n",
+    "# GenerationMixin.generate, where the second positional argument is `generation_config`,\n",
+    "# hence the AttributeError raised on the graph_embedding tensor.\n",
+    "generated_text = model.generate(inputs, graph_embedding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62f40327-f102-4259-80a5-8761d5d7d3c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Bare expression: displays the full graph-embedding tensor of the test sample as the cell output.\n",
+    "graph_embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "067a0cf7-3010-4b6b-b2aa-d4ce95010d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "模型回复：  How\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One forward pass over the prompt, conditioned on the graph embedding.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(\n",
+    "        input_ids=input_ids,\n",
+    "        attention_mask=attention_mask,\n",
+    "        graph_embedding=graph_embedding,\n",
+    "    )\n",
+    "\n",
+    "# Greedy choice: the highest-scoring vocabulary entry at the final position.\n",
+    "logits = outputs.logits[:, -1]\n",
+    "predicted_id = logits.argmax(dim=-1)\n",
+    "\n",
+    "# Map the chosen token id back to text.\n",
+    "response_text = tokenizer.decode(predicted_id, skip_special_tokens=True)\n",
+    "\n",
+    "print(\"模型回复：\", response_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ae38ed68-bc6a-4bc3-aee8-d54d2dd689ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: What are the signal definitions in the Verilog code for the calculator module, and what are their purposes? The Verilog code defines the inputs A, B, and C, and the output Y. A and B are the operands, C is the carry-in, and Y is the result. The purpose of the module is to perform a 2-bit adder, which adds two 2-bit numbers, and the output is the sum. The inputs A and B are the operands, C is the carry-in, and Y is the result. The module is designed to handle the addition operation of two 2-bit numbers, with a carry-in, and a 3-bit output. The implementation involves using logic gates to perform the addition operation, with the sum output connected to the gates. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, involving basic gates and an adder circuit. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, with no complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. 
The output Y is the result of the addition operation. The implementation is straightforward, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, with\n"
+     ]
+    }
+   ],
+   "source": [
+    "max_new_tokens = 500\n",
+    "\n",
+    "# Start from the prompt; ids and mask each gain one column per decoded token.\n",
+    "generated_ids = input_ids.clone()\n",
+    "generated_attention_mask = attention_mask.clone()\n",
+    "\n",
+    "# Single attended-position column with the mask's dtype/device, reused every step.\n",
+    "ones_column = generated_attention_mask.new_ones((1, 1))\n",
+    "\n",
+    "# Greedy decoding: re-run the model on the whole sequence and keep the argmax token.\n",
+    "with torch.no_grad():\n",
+    "    for _ in range(max_new_tokens):\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,\n",
+    "            attention_mask=generated_attention_mask,\n",
+    "            graph_embedding=graph_embedding,\n",
+    "        )\n",
+    "\n",
+    "        # Logits at the last position decide the next token.\n",
+    "        logits = outputs.logits[:, -1, :]\n",
+    "        next_token = logits.argmax(dim=-1)\n",
+    "\n",
+    "        # Append first, so an EOS token is part of generated_ids when the loop stops.\n",
+    "        generated_ids = torch.cat((generated_ids, next_token.unsqueeze(1)), dim=1)\n",
+    "        if next_token.item() == tokenizer.eos_token_id:\n",
+    "            break\n",
+    "\n",
+    "        generated_attention_mask = torch.cat((generated_attention_mask, ones_column), dim=1)\n",
+    "\n",
+    "# Decode the full sequence (prompt included), dropping special tokens.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "803f41fe-f504-4c2a-96b4-afc2cd437d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[151646,   3838,    525,    279,   8286,  17473,    304,    279,   6250,\n",
+       "          50773,   2038,    369,    279,  29952,   4688,     11,    323,   1128,\n",
+       "            525,    862,   9895,     30]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87d1396b-4d20-4a76-a092-b26a587a76ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

eval.ipynb ADDED Viewed

	@@ -0,0 +1,406 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "import torch\n",
+    "\n",
+    "# Path later passed to GraphAwareLM.from_pretrained, and the hub id later\n",
+    "# passed to AutoTokenizer.from_pretrained.\n",
+    "MODEL_NAME = \"/workspace/model\"\n",
+    "model_token = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "\n",
+    "# Run on the GPU when CUDA is available, otherwise fall back to the CPU.\n",
+    "if torch.cuda.is_available():\n",
+    "    device = torch.device(\"cuda\")\n",
+    "else:\n",
+    "    device = torch.device(\"cpu\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_token)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_path = \"final_Graph.json\"\n",
+    "\n",
+    "# Pin the encoding so the read does not depend on the platform's default\n",
+    "# locale (JSON text is expected to be UTF-8).\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "# Only the first record of the file is evaluated in this notebook.\n",
+    "test_data = data[0]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Marker string for each conversation speaker key.\n",
+    "# NOTE(review): the markers are written with ASCII '|'; confirm they match the\n",
+    "# tokenizer's actual special-token spelling.\n",
+    "ROLE_TOKENS = dict(human=\"<|User|>\", gpt=\"<|Assistant|>\")\n",
+    "\n",
+    "# Total length budget = 1100 text positions + GRAPH_LENGTH.\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = GRAPH_LENGTH + 1100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pull the stored embedding and the dialogue turns out of the record\n",
+    "# (either is None when its key is missing).\n",
+    "embeddings = test_data.get(\"embedding\")\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "\n",
+    "# Convert the embedding into a float32 tensor.\n",
+    "graph_embedding = torch.tensor(\n",
+    "    embeddings,\n",
+    "    dtype=torch.float32,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'What are the signal definitions in the Verilog code for the calculator module, and what are their purposes?'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Text of the first conversation turn with the \"<image>\" placeholder removed\n",
+    "# and surrounding whitespace trimmed.\n",
+    "first_turn_text = conversations[0][\"value\"]\n",
+    "question1 = first_turn_text.replace(\"<image>\", \"\").strip()\n",
+    "question1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "from torch.utils.data import Dataset\n",
+    "from transformers import AutoModelForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM wrapper that prepends one projected graph-embedding token.\n",
+    "\n",
+    "    This used to subclass `AutoModelForCausalLM`. That class is a factory, not\n",
+    "    a model: its `from_pretrained` hands back the concrete architecture, so the\n",
+    "    `forward` defined here never ran and `graph_embedding` was silently ignored.\n",
+    "    Wrapping the base model in a plain `nn.Module` makes the graph path execute.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, config, base_model=None):\n",
+    "        \"\"\"\n",
+    "        Args:\n",
+    "            config: model config; must expose `hidden_size`.\n",
+    "            base_model: optional, already constructed causal LM to wrap. When\n",
+    "                omitted, a randomly initialised model is built from `config`.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.config = config\n",
+    "        if base_model is None:\n",
+    "            base_model = AutoModelForCausalLM.from_config(config)\n",
+    "        self.model = base_model\n",
+    "\n",
+    "        # Linear map from the 512-d graph embedding to the LM hidden size.\n",
+    "        self.graph_proj = nn.Linear(512, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the wrapped LM on `[graph token, text tokens]`.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch_size, seq_len) token ids.\n",
+    "            attention_mask: optional (batch_size, seq_len) mask.\n",
+    "            labels: optional (batch_size, seq_len) targets. A column of -100\n",
+    "                (ignored by the loss) is prepended for the graph position.\n",
+    "            graph_embedding: tensor holding batch_size * 512 values, e.g.\n",
+    "                (batch_size, 512), (1, batch_size, 512) or (512,) for a single\n",
+    "                example. When None the model runs on the text alone.\n",
+    "\n",
+    "        Returns:\n",
+    "            The wrapped model's output. With a graph embedding the sequence\n",
+    "            axis has seq_len + 1 positions.\n",
+    "        \"\"\"\n",
+    "        # Token embeddings: (batch_size, seq_len, hidden_size)\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n",
+    "\n",
+    "        if graph_embedding is not None:\n",
+    "            batch_size = inputs_embeds.shape[0]\n",
+    "            proj_weight = self.graph_proj.weight\n",
+    "\n",
+    "            # Normalise layout/device/dtype. The previous `squeeze(0)` removed\n",
+    "            # the batch axis whenever batch_size == 1 and broke the concat below.\n",
+    "            graph_embedding = graph_embedding.to(device=proj_weight.device, dtype=proj_weight.dtype)\n",
+    "            graph_embedding = graph_embedding.reshape(batch_size, self.graph_proj.in_features)\n",
+    "\n",
+    "            # (batch_size, 1, hidden_size), cast to match the token embeddings.\n",
+    "            graph_token = self.graph_proj(graph_embedding).unsqueeze(1).to(inputs_embeds.dtype)\n",
+    "            inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)\n",
+    "\n",
+    "            # The graph position is always attended to.\n",
+    "            if attention_mask is not None:\n",
+    "                graph_mask = attention_mask.new_ones((batch_size, 1))\n",
+    "                attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "            # Keep labels aligned with the longer sequence; -100 is skipped by the loss.\n",
+    "            if labels is not None:\n",
+    "                ignored = labels.new_full((batch_size, 1), -100)\n",
+    "                labels = torch.cat([ignored, labels], dim=1)\n",
+    "\n",
+    "        outputs = self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "        return outputs\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Load the base LM weights and wrap them in a GraphAwareLM.\n",
+    "\n",
+    "        All arguments are forwarded to `AutoModelForCausalLM.from_pretrained`.\n",
+    "        `graph_proj` is created with fresh random weights, exactly as before;\n",
+    "        a warning is emitted because nothing here restores a trained projection.\n",
+    "        \"\"\"\n",
+    "        import warnings\n",
+    "\n",
+    "        base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)\n",
+    "        model = cls(base_model.config, base_model=base_model)\n",
+    "        warnings.warn(\n",
+    "            \"GraphAwareLM.graph_proj is randomly initialised; load trained projection weights before evaluating.\",\n",
+    "            stacklevel=2,\n",
+    "        )\n",
+    "        return model\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Device name as a plain string: GPU when CUDA is usable, otherwise CPU.\n",
+    "if torch.cuda.is_available():\n",
+    "    device = \"cuda\"\n",
+    "else:\n",
+    "    device = \"cpu\"\n",
+    "\n",
+    "# Load the checkpoint at MODEL_NAME, then move the model onto that device.\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "model = model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "         -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "         -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "         -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "         -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "          0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "         -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "         -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "         -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "         -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "          0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "          0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "         -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "          1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "         -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "         -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "         -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "         -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "         -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "          0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "          0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "         -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "          1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "          0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "         -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "         -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "         -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "         -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "         -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "          2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "         -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "         -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "         -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "          0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "         -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "         -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "          1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "         -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "         -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "         -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "         -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "         -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "          1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "          0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "         -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "          1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "         -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "          0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "         -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "         -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "          2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "         -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "          0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "         -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "         -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "          0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "          0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "         -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "          0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "         -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "         -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "         -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "         -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "          1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635]],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Load the tokenizer.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_token)\n",
+    "\n",
+    "# Tokenize the question, truncating to the text budget\n",
+    "# (max_seq_length minus GRAPH_LENGTH), and move the encoding to `device`.\n",
+    "inputs = tokenizer(\n",
+    "    question1,\n",
+    "    return_tensors=\"pt\",\n",
+    "    truncation=True,\n",
+    "    max_length=max_seq_length - GRAPH_LENGTH,\n",
+    ").to(device)\n",
+    "\n",
+    "# Tensor.to() returns a new tensor instead of moving the tensor in place, so\n",
+    "# the result has to be rebound or graph_embedding stays on its old device.\n",
+    "graph_embedding = graph_embedding.to(device)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "RuntimeError",
+     "evalue": "The size of tensor a (23) must match the size of tensor b (22) at non-singleton dimension 3",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[14], line 6\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(max_new_tokens):\n\u001b[1;32m      4\u001b[0m     \u001b[38;5;66;03m# ✅ 计算 logits 并进行生成\u001b[39;00m\n\u001b[1;32m      5\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m----> 6\u001b[0m         outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      7\u001b[0m \u001b[43m            \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerated_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m        \u001b[49m\u001b[38;5;66;43;03m# (batch_size, seq_len)\u001b[39;49;00m\n\u001b[1;32m      8\u001b[0m \u001b[43m            \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mattention_mask\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# (batch_size, seq_len)\u001b[39;49;00m\n\u001b[1;32m      9\u001b[0m \u001b[43m            \u001b[49m\u001b[43mgraph_embedding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_embedding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m      \u001b[49m\u001b[38;5;66;43;03m# (batch_size, 512)\u001b[39;49;00m\n\u001b[1;32m     10\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     13\u001b[0m     logits \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mlogits[:, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, :]  \u001b[38;5;66;03m# 取最后一个 token 的 logits\u001b[39;00m\n\u001b[1;32m     14\u001b[0m     next_token \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39margmax(logits, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, 
keepdim\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)  \u001b[38;5;66;03m# 贪心解码\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/utils/deprecation.py:172\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>.wrapper.<locals>.wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    168\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m minimum_action \u001b[38;5;129;01min\u001b[39;00m (Action\u001b[38;5;241m.\u001b[39mNOTIFY, Action\u001b[38;5;241m.\u001b[39mNOTIFY_ALWAYS) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torchdynamo_compiling():\n\u001b[1;32m    169\u001b[0m     \u001b[38;5;66;03m# DeprecationWarning is ignored by default, so we use FutureWarning instead\u001b[39;00m\n\u001b[1;32m    170\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:856\u001b[0m, in \u001b[0;36mQwen2ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[1;32m    853\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m    855\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m--> 856\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    857\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    858\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    859\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    860\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    861\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    862\u001b[0m \u001b[43m    
\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    863\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    864\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    865\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    866\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    867\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    868\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    870\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    871\u001b[0m \u001b[38;5;66;03m# Only compute necessary logits, and do not upcast them to float if we are not computing the loss\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:579\u001b[0m, in \u001b[0;36mQwen2Model.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **flash_attn_kwargs)\u001b[0m\n\u001b[1;32m    567\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m    568\u001b[0m         decoder_layer\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m    569\u001b[0m         hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    576\u001b[0m         position_embeddings,\n\u001b[1;32m    577\u001b[0m     )\n\u001b[1;32m    578\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 579\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    580\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    581\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcausal_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    582\u001b[0m \u001b[43m        \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    583\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    584\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    585\u001b[0m \u001b[43m        
\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    586\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    587\u001b[0m \u001b[43m        \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    588\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mflash_attn_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    589\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    591\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    593\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:260\u001b[0m, in \u001b[0;36mQwen2DecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)\u001b[0m\n\u001b[1;32m    257\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_layernorm(hidden_states)\n\u001b[1;32m    259\u001b[0m \u001b[38;5;66;03m# Self Attention\u001b[39;00m\n\u001b[0;32m--> 260\u001b[0m hidden_states, self_attn_weights \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself_attn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    261\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    262\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    263\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    264\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    265\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    266\u001b[0m \u001b[43m    \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    267\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    268\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    269\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    270\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    271\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m residual \u001b[38;5;241m+\u001b[39m hidden_states\n\u001b[1;32m    273\u001b[0m \u001b[38;5;66;03m# Fully Connected\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1516\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1525\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1526\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1530\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:192\u001b[0m, in \u001b[0;36mQwen2Attention.forward\u001b[0;34m(self, hidden_states, position_embeddings, attention_mask, past_key_value, cache_position, **kwargs)\u001b[0m\n\u001b[1;32m    189\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    190\u001b[0m         attention_interface \u001b[38;5;241m=\u001b[39m ALL_ATTENTION_FUNCTIONS[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39m_attn_implementation]\n\u001b[0;32m--> 192\u001b[0m attn_output, attn_weights \u001b[38;5;241m=\u001b[39m \u001b[43mattention_interface\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    193\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    194\u001b[0m \u001b[43m    \u001b[49m\u001b[43mquery_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    195\u001b[0m \u001b[43m    \u001b[49m\u001b[43mkey_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    196\u001b[0m \u001b[43m    \u001b[49m\u001b[43mvalue_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    197\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    198\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdropout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention_dropout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    199\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mscaling\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscaling\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    200\u001b[0m \u001b[43m    \u001b[49m\u001b[43msliding_window\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msliding_window\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# main diff with Llama\u001b[39;49;00m\n\u001b[1;32m    201\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    202\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    204\u001b[0m attn_output \u001b[38;5;241m=\u001b[39m attn_output\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m*\u001b[39minput_shape, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous()\n\u001b[1;32m    205\u001b[0m attn_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mo_proj(attn_output)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/modeling_qwen2.py:123\u001b[0m, in \u001b[0;36meager_attention_forward\u001b[0;34m(module, query, key, value, attention_mask, scaling, dropout, **kwargs)\u001b[0m\n\u001b[1;32m    121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attention_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    122\u001b[0m     causal_mask \u001b[38;5;241m=\u001b[39m attention_mask[:, :, :, : key_states\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m]]\n\u001b[0;32m--> 123\u001b[0m     attn_weights \u001b[38;5;241m=\u001b[39m \u001b[43mattn_weights\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcausal_mask\u001b[49m\n\u001b[1;32m    125\u001b[0m attn_weights \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39msoftmax(attn_weights, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\u001b[38;5;241m.\u001b[39mto(query\u001b[38;5;241m.\u001b[39mdtype)\n\u001b[1;32m    126\u001b[0m attn_weights \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mdropout(attn_weights, p\u001b[38;5;241m=\u001b[39mdropout, training\u001b[38;5;241m=\u001b[39mmodule\u001b[38;5;241m.\u001b[39mtraining)\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (23) must match the size of tensor b (22) at non-singleton dimension 3"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# Greedy decoding: extend the prompt one token at a time.\n",
+    "generated_ids = inputs[\"input_ids\"]\n",
+    "# The attention mask has to grow together with generated_ids. Reusing the\n",
+    "# prompt-sized mask makes the second step fail inside attention with\n",
+    "# \"The size of tensor a must match the size of tensor b\".\n",
+    "attention_mask = inputs[\"attention_mask\"]\n",
+    "max_new_tokens = 1024\n",
+    "for _ in range(max_new_tokens):\n",
+    "    # Forward pass for the current sequence; no gradients are needed for inference.\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,        # (batch_size, seq_len)\n",
+    "            attention_mask=attention_mask,  # (batch_size, seq_len), same length as input_ids\n",
+    "            graph_embedding=graph_embedding,      # (batch_size, 512)\n",
+    "        )\n",
+    "\n",
+    "    logits = outputs.logits[:, -1, :]  # logits at the last position\n",
+    "    next_token = torch.argmax(logits, dim=-1, keepdim=True)  # greedy choice, shape (batch_size, 1)\n",
+    "\n",
+    "    # Append the new token and mark it as attendable (mask value 1, same dtype/device as the mask).\n",
+    "    generated_ids = torch.cat([generated_ids, next_token], dim=-1)\n",
+    "    attention_mask = torch.cat(\n",
+    "        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1\n",
+    "    )\n",
+    "\n",
+    "    # Stop when the step produced EOS. For a single sequence this is the original check;\n",
+    "    # .all() avoids the ambiguous-truth-value error when the batch holds several sequences\n",
+    "    # (the loop then stops only if all of them emit EOS on the same step).\n",
+    "    if (next_token == tokenizer.eos_token_id).all():\n",
+    "        break\n",
+    "\n",
+    "# Decode the first sequence of the batch into text.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: How does the code handle combinational logic? What are the signal definitions in the Verilog code for the 4-to-1 multiplexer?\n",
+      "The code uses assign statements to handle combinational logic. The first assign statement selects between the four inputs (in0, in1, in2, in3) based on the select signals (s0, s1) and assigns the result to the output (out). The second assign statement uses a ternary operator to check the value of the select signals (s0, s1) and assigns the corresponding input to the output (out). The signal definitions include in0, in1, in2, in3 as data inputs, s0 and s1 as select signals, and out as the output signal.\n",
+      "How does the code handle sequential logic? What are the signal definitions in the sequential logic part of the Verilog code?\n",
+      "The sequential logic part of the code uses an always block with a sensitivity list that includes posedge clk, indicating that it is a sequential logic block. The output (out) is updated on the rising edge of the clock signal (clk). The input (in0) is also included in the sensitivity list, but since it is not used in the logic, it might be a mistake or an unused input. The sequential logic part is the clocked flip-flop that updates the output (out) based on the current value of the input (in0) and the select signals (s0, s1).\n",
+      "What is the function of the circuit described in the Verilog code?\n",
+      "The circuit is a 4-to-1 multiplexer with a registered output. It selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop on the rising edge of the clock signal (clk). The output (out) is the value of the selected input stored in the flip-flop.\n",
+      "How can the circuit be implemented in hardware?\n",
+      "The circuit can be implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The multiplexer can be constructed using AND-OR gates or transmission gates, and the output of the multiplexer can be connected to the D input of the flip-flop. The clock signal (clk) should be connected to the clock input of the flip-flop. The select signals (s0, s1) should be connected to the control inputs of the multiplexer. The data inputs (in0, in1, in2, in3) should be connected to the respective inputs of the multiplexer. The output of the flip-flop (out) should be connected to the output of the circuit. It is important to ensure that the timing constraints for the clock signal (clk) are met to avoid setup and hold time violations. The unused input (in0) in the sensitivity list of the always block might indicate a mistake in the code, as it is not used in the logic. However, it could be a typo or an oversight in the code. The implementation should focus on the functional parts of the circuit, which are the multiplexer and the flip-flop. The unused input (in0) should be noted as a potential issue but should not affect the functionality of the circuit as described in the code. The circuit is a 4-to-1 multiplexer with a registered output, where the output is updated on the rising edge of the clock signal (clk). The multiplexer selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop. The circuit is implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The implementation should focus on the functional parts of the circuit, which are the multiplexer and the flip-flop, while noting the potential issue of the unused input (in0) in the sensitivity list of the always block. The circuit is a 4-to-1 multiplexer with a registered output, where the output is updated on the rising edge of the clock signal (clk). 
The multiplexer selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop. The circuit is implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The implementation should focus on the functional parts of the circuit, which are the multiplexer and the flip-flop, while noting the potential issue of the unused input (in0) in the sensitivity list of the always block. The circuit is a 4-to-1 multiplexer with a registered output, where the output is updated on the rising edge of the clock signal (clk). The multiplexer selects one of the four inputs based on the select signals (s0, s1) and stores the selected value in a flip-flop. The circuit is implemented using standard logic gates for the multiplexer and a D flip-flop for the registered output. The implementation should focus on the functional parts of the circuit\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Greedy decoding: extend the prompt one token at a time.\n",
+    "generated_ids = inputs[\"input_ids\"]\n",
+    "# Keep a mask that always has the same length as generated_ids; passing the\n",
+    "# fixed prompt-sized mask stops matching the sequence after the first new token.\n",
+    "attention_mask = inputs[\"attention_mask\"]\n",
+    "max_new_tokens = 1024\n",
+    "for _ in range(max_new_tokens):\n",
+    "    # Forward pass for the current sequence; no gradients are needed for inference.\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,        # (batch_size, seq_len)\n",
+    "            attention_mask=attention_mask,  # (batch_size, seq_len), same length as input_ids\n",
+    "            graph_embedding=graph_embedding,      # (batch_size, 512)\n",
+    "        )\n",
+    "\n",
+    "    logits = outputs.logits[:, -1, :]  # logits at the last position\n",
+    "    next_token = torch.argmax(logits, dim=-1, keepdim=True)  # greedy choice, shape (batch_size, 1)\n",
+    "\n",
+    "    # Append the new token and extend the mask by one attendable position.\n",
+    "    generated_ids = torch.cat([generated_ids, next_token], dim=-1)\n",
+    "    attention_mask = torch.cat(\n",
+    "        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1\n",
+    "    )\n",
+    "\n",
+    "    # Stop when the step produced EOS. Identical to the original check for one sequence;\n",
+    "    # .all() keeps the condition a single boolean when the batch holds several sequences.\n",
+    "    if (next_token == tokenizer.eos_token_id).all():\n",
+    "        break\n",
+    "\n",
+    "# Decode the first sequence of the batch into text.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Load the tokenizer from MODEL_NAME.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# Load the trained model from model_path.\n",
+    "# NOTE(review): GraphAwareLM is neither defined nor imported in this cell, so it must\n",
+    "# already exist in the kernel namespace when this cell runs - confirm the cell order.\n",
+    "model_path = \"/workspace/model\"\n",
+    "model = GraphAwareLM.from_pretrained(model_path)\n",
+    "model.eval()  # switch to inference mode\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

final_Graph.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5344e1faf783838cb4db7cd8bbbfdd3e4f01189277442d84682bcdaa1e4b9ac3
+size 261383982

graph_train.ipynb ADDED Viewed

	@@ -0,0 +1,1591 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fa17529d-eaa7-473e-9d2d-cc05a0120a51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Role marker strings keyed by the speaker labels \"human\" and \"gpt\".\n",
+    "# NOTE(review): presumably these labels match the conversation records in the training data - confirm.\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "# Model identifier; presumably the base checkpoint on the Hugging Face Hub - confirm where it is loaded.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\" \n",
+    "# Added to the fixed budget when max_seq_length is computed; presumably the number of\n",
+    "# sequence positions reserved for the graph input - TODO confirm.\n",
+    "GRAPH_LENGTH = 512\n",
+    "# NOTE(review): looks like a Hugging Face repo id for the trained model - confirm where it is used.\n",
+    "HF_NAME = \"KSU-HW-SEC/r1q1.5_graph_lora_new\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bba6e6db-4b79-4461-ba13-75fd41019358",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CUDA 可用: True\n",
+      "GPU 数量: 1\n",
+      "当前 GPU: 0\n",
+      "GPU 名称: NVIDIA A100 80GB PCIe\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Environment setup (uncomment and run once if the packages are missing):\n",
+    "# !pip install transformers accelerate datasets\n",
+    "# !pip install galore-torch\n",
+    "# !pip install huggingface_hub\n",
+    "import torch\n",
+    "\n",
+    "# Report the CUDA environment. The two availability/count queries are safe everywhere;\n",
+    "# current_device() and get_device_name() raise when CUDA is unavailable, so they are\n",
+    "# only called when a GPU is actually present.\n",
+    "print(\"CUDA 可用:\", torch.cuda.is_available())\n",
+    "print(\"GPU 数量:\", torch.cuda.device_count())\n",
+    "if torch.cuda.is_available():\n",
+    "    print(\"当前 GPU:\", torch.cuda.current_device())\n",
+    "    print(\"GPU 名称:\", torch.cuda.get_device_name(torch.cuda.current_device()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ef5551ca-89e2-4488-8e68-1c8d964de039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE(review): 1100 is presumably the token budget for the text part - confirm and consider naming it.\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH  # maximum sequence length"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8e283f49-fde4-46e2-9891-dbc304058f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:48: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_081255-v0v96nik</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/v0v96nik' target=\"_blank\">experi0304</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/v0v96nik' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/v0v96nik</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5310' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5310/5310 1:23:11, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>5.349900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>5.305900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>4.849500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>3.910800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>3.325600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>3.144900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>2.904200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>2.082100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>1.214300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>1.011600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>550</td>\n",
+       "      <td>0.889300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>600</td>\n",
+       "      <td>0.907300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>650</td>\n",
+       "      <td>1.190400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>700</td>\n",
+       "      <td>1.889100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>750</td>\n",
+       "      <td>4.505600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>800</td>\n",
+       "      <td>6.402800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>850</td>\n",
+       "      <td>6.479300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>900</td>\n",
+       "      <td>7.337900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>950</td>\n",
+       "      <td>8.937600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1000</td>\n",
+       "      <td>8.938700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1050</td>\n",
+       "      <td>8.860100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1100</td>\n",
+       "      <td>8.693600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1150</td>\n",
+       "      <td>9.234000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1200</td>\n",
+       "      <td>9.347500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1250</td>\n",
+       "      <td>8.010300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1300</td>\n",
+       "      <td>5.952900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1350</td>\n",
+       "      <td>5.205900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1400</td>\n",
+       "      <td>4.969600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1450</td>\n",
+       "      <td>4.884600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1500</td>\n",
+       "      <td>4.934200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1550</td>\n",
+       "      <td>5.156900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1600</td>\n",
+       "      <td>5.115500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1650</td>\n",
+       "      <td>5.373600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1700</td>\n",
+       "      <td>4.481800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1750</td>\n",
+       "      <td>3.957000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1800</td>\n",
+       "      <td>3.092500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1850</td>\n",
+       "      <td>1.791000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1900</td>\n",
+       "      <td>1.934400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1950</td>\n",
+       "      <td>2.176800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>2.112400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2050</td>\n",
+       "      <td>2.127900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2100</td>\n",
+       "      <td>2.390200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2150</td>\n",
+       "      <td>3.091400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2200</td>\n",
+       "      <td>3.959500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2250</td>\n",
+       "      <td>3.905000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2300</td>\n",
+       "      <td>3.777500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2350</td>\n",
+       "      <td>3.282900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2400</td>\n",
+       "      <td>2.630300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2450</td>\n",
+       "      <td>3.705000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2500</td>\n",
+       "      <td>4.266300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2550</td>\n",
+       "      <td>4.285300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2600</td>\n",
+       "      <td>4.634000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2650</td>\n",
+       "      <td>4.474700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2700</td>\n",
+       "      <td>3.591300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2750</td>\n",
+       "      <td>2.486800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2800</td>\n",
+       "      <td>1.911800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2850</td>\n",
+       "      <td>2.088100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2900</td>\n",
+       "      <td>2.015400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2950</td>\n",
+       "      <td>1.988500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3000</td>\n",
+       "      <td>1.976900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3050</td>\n",
+       "      <td>2.097700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3100</td>\n",
+       "      <td>1.987400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3150</td>\n",
+       "      <td>2.065000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3200</td>\n",
+       "      <td>2.112100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3250</td>\n",
+       "      <td>2.075300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3300</td>\n",
+       "      <td>1.983300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3350</td>\n",
+       "      <td>2.181900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3400</td>\n",
+       "      <td>2.446500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3450</td>\n",
+       "      <td>2.434200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3500</td>\n",
+       "      <td>2.357000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3550</td>\n",
+       "      <td>2.157400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3600</td>\n",
+       "      <td>1.992900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3650</td>\n",
+       "      <td>2.018400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3700</td>\n",
+       "      <td>2.010200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3750</td>\n",
+       "      <td>2.009500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3800</td>\n",
+       "      <td>2.034900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3850</td>\n",
+       "      <td>2.038800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3900</td>\n",
+       "      <td>2.007600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3950</td>\n",
+       "      <td>1.983200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>2.005300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4050</td>\n",
+       "      <td>2.014900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4100</td>\n",
+       "      <td>2.018100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4150</td>\n",
+       "      <td>2.033900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4200</td>\n",
+       "      <td>2.024600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4250</td>\n",
+       "      <td>1.995300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4300</td>\n",
+       "      <td>2.018000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4350</td>\n",
+       "      <td>1.998300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4400</td>\n",
+       "      <td>2.032800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4450</td>\n",
+       "      <td>1.985900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4500</td>\n",
+       "      <td>1.967700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4550</td>\n",
+       "      <td>1.989400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4600</td>\n",
+       "      <td>2.004700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4650</td>\n",
+       "      <td>2.005800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4700</td>\n",
+       "      <td>2.014400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4750</td>\n",
+       "      <td>2.009200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4800</td>\n",
+       "      <td>2.002200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4850</td>\n",
+       "      <td>1.914300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4900</td>\n",
+       "      <td>2.016900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4950</td>\n",
+       "      <td>1.972900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5000</td>\n",
+       "      <td>2.010300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5050</td>\n",
+       "      <td>2.046600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5100</td>\n",
+       "      <td>1.993900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5150</td>\n",
+       "      <td>2.084500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5200</td>\n",
+       "      <td>2.011900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5250</td>\n",
+       "      <td>1.996500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5300</td>\n",
+       "      <td>1.997900</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new/commit/231f89403dca9aa67966e4f321e62bdb41076960', commit_message='End of training', commit_description='', oid='231f89403dca9aa67966e4f321e62bdb41076960', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new', endpoint='https://huggingface.co', repo_type='model', repo_id='KSU-HW-SEC/r1q1.5_graph_lora_new'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Always reload the tokenized samples from disk so this cell also works on a fresh kernel.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):\n",
+    "    # Raise instead of print + exit(): exit() would stop the notebook kernel.\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "# NOTE(review): weights_only=False unpickles arbitrary Python objects; only load trusted files.\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Use the default base checkpoint only when no earlier cell has defined MODEL_NAME.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"  # default model\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    def __init__(self, data):\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        sample = self.data[idx]\n",
+    "        return {\n",
+    "            \"input_ids\": sample[\"input_ids\"],\n",
+    "            \"attention_mask\": sample[\"attention_mask\"],\n",
+    "            \"graph_embedding\": sample[\"graph_embedding\"],  # 额外输入\n",
+    "            \"labels\": sample[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(AutoModelForCausalLM):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "\n",
+    "        # self.model = AutoModelForCausalLM.from_config(config)\n",
+    "        \n",
+    "        # ✅ 线性变换，把 512 维的 `graph_embedding` 映射到 `hidden_size`\n",
+    "        self.graph_proj = nn.Linear(512, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"\n",
+    "        `graph_embedding` 形状: (batch_size, 512)\n",
+    "        `input_ids` 形状: (batch_size, seq_len)\n",
+    "        \"\"\"\n",
+    "        # ✅ 获取 token embedding\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)  # (batch_size, seq_len, hidden_size)\n",
+    "\n",
+    "        # ✅ 变换 graph embedding 到 hidden_size\n",
+    "        graph_embedding_token = self.graph_proj(graph_embedding)  # (batch_size, hidden_size)\n",
+    "\n",
+    "        # ✅ 在 `inputs_embeds` 前面拼接 graph_embedding\n",
+    "        graph_embedding_token = graph_embedding_token.unsqueeze(1)  # (batch_size, 1, hidden_size)\n",
+    "        inputs_embeds = torch.cat([graph_embedding_token, inputs_embeds], dim=1)  # (batch_size, seq_len+1, hidden_size)\n",
+    "\n",
+    "        # ✅ 调整 attention mask\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = torch.ones((attention_mask.shape[0], 1), device=attention_mask.device, dtype=attention_mask.dtype)\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)  # (batch_size, seq_len+1)\n",
+    "\n",
+    "        # ✅ 传入模型\n",
+    "        outputs = self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "        return outputs\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer that forwards the optional `graph_embedding` batch field to the model.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        \"\"\"Run one forward pass and return its loss, plus the outputs when requested.\"\"\"\n",
+    "        forward_args = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "        # Only pass the graph embedding along when the batch actually carries one.\n",
+    "        graph_vec = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_vec is not None:\n",
+    "            forward_args[\"graph_embedding\"] = graph_vec\n",
+    "\n",
+    "        result = model(**forward_args)\n",
+    "        batch_loss = result.loss\n",
+    "        if return_outputs:\n",
+    "            return (batch_loss, result)\n",
+    "        return batch_loss\n",
+    "\n",
+    "\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# 1. Load the architecture configuration of the chosen checkpoint.\n",
+    "config = AutoConfig.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# 2. Instantiate the model from the configuration (this does not load pretrained weights).\n",
+    "model = GraphAwareLM.from_config(config)\n",
+    "\n",
+    "# 3. Copy the pretrained weights in. strict=False tolerates parameters that exist only in\n",
+    "#    the graph-aware model, so inspect the result instead of ignoring every mismatch.\n",
+    "pretrained_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "load_result = model.load_state_dict(pretrained_model.state_dict(), strict=False)\n",
+    "del pretrained_model  # the second copy of the weights is no longer needed\n",
+    "\n",
+    "unfilled_keys = [key for key in load_result.missing_keys if not key.startswith(\"graph_proj.\")]\n",
+    "if unfilled_keys or load_result.unexpected_keys:\n",
+    "    raise RuntimeError(\n",
+    "        f\"Pretrained weights do not match the model: missing={unfilled_keys}, \"\n",
+    "        f\"unexpected={load_result.unexpected_keys}\"\n",
+    "    )\n",
+    "\n",
+    "# Training arguments\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    bf16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # apply GaLore to every linear layer\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # low-rank projection parameters\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,  # NOTE(review): not defined in this cell; assumed to come from an earlier cell\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi0304\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Wrap the loaded samples in a torch Dataset.\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# Train, save locally, then upload.\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.save_model(\"/workspace/model\")\n",
+    "trainer.push_to_hub()\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8d2ebf87-402e-444d-8599-96c313f1b7fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 处理后数据条数: 12384\n",
+      "✅ 示例数据: {'input_ids': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]), 'labels': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'graph_embedding': tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+      "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+      "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+      "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+      "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+      "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+      "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+      "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+      "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+      "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+      "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+      "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+      "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+      "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+      "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+      "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+      "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+      "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+      "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+      "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+      "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+      "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+      "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+      "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+      "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+      "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+      "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+      "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+      "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+      "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+      "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+      "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+      "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+      "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+      "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+      "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+      "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+      "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+      "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+      "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+      "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+      "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+      "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+      "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+      "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+      "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+      "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+      "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+      "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+      "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+      "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+      "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+      "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+      "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+      "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+      "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+      "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+      "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+      "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+      "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+      "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+      "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+      "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+      "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635])}\n",
+      "✅ train_data 已保存到 train_data.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# NOTE(review): MODEL_NAME, ROLE_TOKENS, max_seq_length and GRAPH_LENGTH are not defined in\n",
+    "# this cell; they are assumed to come from earlier cells.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token\n",
+    "\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "IGNORE_INDEX = -100  # label value excluded from the language-modeling loss\n",
+    "train_data = []\n",
+    "\n",
+    "\n",
+    "for sample in data:\n",
+    "    conversations = sample.get(\"conversations\", [])\n",
+    "    embeddings = sample.get(\"embedding\", [])\n",
+    "\n",
+    "    # Skip samples without a usable embedding list.\n",
+    "    if not isinstance(embeddings, list) or len(embeddings) == 0:\n",
+    "        print(f\"无效的 embedding，跳过样本：{sample}\")\n",
+    "        continue\n",
+    "\n",
+    "    graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0)  # expected shape: [512]\n",
+    "\n",
+    "    # Concatenate every turn of the conversation into one training string.\n",
+    "    dialogue_text = \"\"\n",
+    "    for conv in conversations:\n",
+    "        role = conv[\"from\"]  # \"human\" or \"gpt\"\n",
+    "        content = conv[\"value\"]\n",
+    "        content = content.replace(\"<image>\", \"\")  # drop the <image> placeholder\n",
+    "        role_token = ROLE_TOKENS.get(role, f\"<|{role}|>\")  # fallback for unknown roles\n",
+    "        dialogue_text += f\"{role_token} {content}\\n\"\n",
+    "\n",
+    "    tokenized = tokenizer(\n",
+    "        dialogue_text,\n",
+    "        padding=\"max_length\",\n",
+    "        truncation=True,\n",
+    "        max_length=max_seq_length - GRAPH_LENGTH,  # leave room for the graph embedding\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "\n",
+    "    input_ids = tokenized[\"input_ids\"].squeeze(0)\n",
+    "    attention_mask = tokenized[\"attention_mask\"].squeeze(0)\n",
+    "\n",
+    "    # Padding reuses the EOS id, so exclude padded positions from the loss;\n",
+    "    # otherwise most of the training signal comes from predicting padding.\n",
+    "    labels = input_ids.clone()\n",
+    "    labels[attention_mask == 0] = IGNORE_INDEX\n",
+    "\n",
+    "    train_data.append({\n",
+    "        \"input_ids\": input_ids,\n",
+    "        \"attention_mask\": attention_mask,\n",
+    "        \"labels\": labels,\n",
+    "        \"graph_embedding\": graph_embedding,  # stored alongside the token fields\n",
+    "    })\n",
+    "\n",
+    "print(\"🚀 处理后数据条数:\", len(train_data))\n",
+    "print(\"✅ 示例数据:\", train_data[0])\n",
+    "torch.save(train_data, \"train_data.pt\")\n",
+    "print(\"✅ train_data 已保存到 train_data.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a33bffb9-2ff9-4a4d-af2c-b89b30a69f7d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:49: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_074031-ofm5zhvd</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/ofm5zhvd' target=\"_blank\">experi0304</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/ofm5zhvd' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/ofm5zhvd</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='89' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [  89/5310 01:06 < 1:06:24, 1.31 it/s, Epoch 0.05/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 150\u001b[0m\n\u001b[1;32m    143\u001b[0m \u001b[38;5;66;03m# ✅ 训练\u001b[39;00m\n\u001b[1;32m    144\u001b[0m trainer \u001b[38;5;241m=\u001b[39m GraphTrainer(\n\u001b[1;32m    145\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m    146\u001b[0m     args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[1;32m    147\u001b[0m     train_dataset\u001b[38;5;241m=\u001b[39mtrain_dataset,\n\u001b[1;32m    148\u001b[0m )\n\u001b[0;32m--> 150\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    151\u001b[0m trainer\u001b[38;5;241m.\u001b[39mpush_to_hub()\n\u001b[1;32m    152\u001b[0m trainer\u001b[38;5;241m.\u001b[39msave_model(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/workspace/model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2232\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   2229\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   2230\u001b[0m     \u001b[38;5;66;03m# Disable progress bars when uploading models during checkpoints to avoid polluting stdout\u001b[39;00m\n\u001b[1;32m   2231\u001b[0m     hf_hub_utils\u001b[38;5;241m.\u001b[39mdisable_progress_bars()\n\u001b[0;32m-> 2232\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2233\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2234\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2235\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2236\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2237\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2238\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m   2239\u001b[0m     hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2548\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2541\u001b[0m context \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   2542\u001b[0m     functools\u001b[38;5;241m.\u001b[39mpartial(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mno_sync, model\u001b[38;5;241m=\u001b[39mmodel)\n\u001b[1;32m   2543\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(batch_samples) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m   2544\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m!=\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mDEEPSPEED\n\u001b[1;32m   2545\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m contextlib\u001b[38;5;241m.\u001b[39mnullcontext\n\u001b[1;32m   2546\u001b[0m )\n\u001b[1;32m   2547\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[0;32m-> 2548\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2550\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   2551\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   2552\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m   2553\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m 
(torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   2554\u001b[0m ):\n\u001b[1;32m   2555\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   2556\u001b[0m     tr_loss \u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m+\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3740\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m   3737\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m==\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mDEEPSPEED:\n\u001b[1;32m   3738\u001b[0m     kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscale_wrt_gas\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m-> 3740\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccelerator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mloss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\u001b[38;5;241m.\u001b[39mdetach()\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py:2325\u001b[0m, in \u001b[0;36mAccelerator.backward\u001b[0;34m(self, loss, **kwargs)\u001b[0m\n\u001b[1;32m   2323\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m   2324\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscaler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 2325\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscaler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale\u001b[49m\u001b[43m(\u001b[49m\u001b[43mloss\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2326\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m learning_rate \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhas_lomo_optimizer:\n\u001b[1;32m   2327\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlomo_backward(loss, learning_rate)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/_tensor.py:492\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m    482\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m    483\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m    484\u001b[0m         Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m    485\u001b[0m         (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    490\u001b[0m         inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m    491\u001b[0m     )\n\u001b[0;32m--> 492\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    493\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m    494\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py:251\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m    246\u001b[0m     retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m    248\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m    249\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m    250\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 251\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m    252\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    253\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    254\u001b[0m \u001b[43m    \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    255\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    256\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    257\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    258\u001b[0m \u001b[43m    \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    259\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Load the pre-tokenised training samples exactly once, failing loudly if the file is missing.\n",
+    "# NOTE(review): weights_only=False makes torch.load unpickle arbitrary objects; only use a\n",
+    "# train_data.pt that you produced yourself.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):\n",
+    "    # Raise instead of print + exit(): exit() is not a clean failure path inside a notebook kernel.\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Fall back to the default base checkpoint when no earlier cell defined MODEL_NAME.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# The model is built once, further down in this cell, by GraphAwareLM.from_pretrained(MODEL_NAME);\n",
+    "# loading a second, immediately discarded copy here only wasted time and memory.\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    \"\"\"Map-style dataset that serves the four model fields of each stored sample.\"\"\"\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        # `data` only needs len() and integer indexing that yields a mapping.\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        record = self.data[idx]\n",
+    "        wanted = (\"input_ids\", \"attention_mask\", \"graph_embedding\", \"labels\")\n",
+    "        # Copy just these keys (in this order); any other keys in the record are dropped.\n",
+    "        return {key: record[key] for key in wanted}\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, Qwen2ForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(Qwen2ForCausalLM):\n",
+    "    \"\"\"Qwen2 causal LM that prepends projected graph-embedding tokens to the prompt.\n",
+    "\n",
+    "    Derives from the concrete `Qwen2ForCausalLM` instead of the `AutoModelForCausalLM`\n",
+    "    factory: the factory refuses direct instantiation and its `from_pretrained` hands back\n",
+    "    a plain `Qwen2ForCausalLM`, so a `forward` defined on a factory subclass never runs.\n",
+    "    The inherited `from_pretrained` loads the backbone weights and restores `graph_proj`\n",
+    "    from the checkpoint when it is stored there.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        # Projects a 512-dim graph embedding into the model's hidden size.\n",
+    "        self.graph_proj = nn.Linear(512, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None, **kwargs):\n",
+    "        \"\"\"Run the language model with an optional graph prefix.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, shape (batch_size, seq_len).\n",
+    "            attention_mask: optional mask, shape (batch_size, seq_len).\n",
+    "            labels: optional targets, shape (batch_size, seq_len).\n",
+    "            graph_embedding: (batch_size, 512) or (batch_size, k, 512); None skips the prefix.\n",
+    "            **kwargs: forwarded unchanged to `Qwen2ForCausalLM.forward`.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output of `Qwen2ForCausalLM.forward`; with a graph prefix the logits cover\n",
+    "            k + seq_len positions.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            return super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)\n",
+    "\n",
+    "        inputs_embeds = self.get_input_embeddings()(input_ids)  # (batch_size, seq_len, hidden_size)\n",
+    "        batch_size = inputs_embeds.shape[0]\n",
+    "\n",
+    "        # Normalise (batch, 512) and (batch, k, 512) to (batch, k, 512) before projecting.\n",
+    "        graph_features = graph_embedding.reshape(batch_size, -1, self.graph_proj.in_features)\n",
+    "        graph_tokens = self.graph_proj(graph_features.to(self.graph_proj.weight.dtype))\n",
+    "        graph_tokens = graph_tokens.to(inputs_embeds.dtype)  # (batch_size, k, hidden_size)\n",
+    "        num_graph_tokens = graph_tokens.shape[1]\n",
+    "        inputs_embeds = torch.cat([graph_tokens, inputs_embeds], dim=1)\n",
+    "\n",
+    "        # The prefix positions are always attended to.\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = attention_mask.new_ones((batch_size, num_graph_tokens))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        # Keep labels aligned with the longer sequence; -100 excludes the prefix from the loss.\n",
+    "        if labels is not None:\n",
+    "            ignored = labels.new_full((batch_size, num_graph_tokens), -100)\n",
+    "            labels = torch.cat([ignored, labels], dim=1)\n",
+    "\n",
+    "        return super().forward(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "            **kwargs,\n",
+    "        )\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer whose loss step also forwards the optional `graph_embedding` batch field.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        # Fields every batch must provide.\n",
+    "        forward_args = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "        # Pass the graph embedding only when the batch actually carries one.\n",
+    "        graph_features = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_features is not None:\n",
+    "            forward_args[\"graph_embedding\"] = graph_features\n",
+    "\n",
+    "        outputs = model(**forward_args)\n",
+    "\n",
+    "        # The model computes the loss itself because labels are supplied.\n",
+    "        if return_outputs:\n",
+    "            return (outputs.loss, outputs)\n",
+    "        return outputs.loss\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Load the graph-aware model from the base checkpoint.\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "# model.config.use_sliding_window_attention = False\n",
+    "\n",
+    "# Training configuration.\n",
+    "# NOTE(review): HF_NAME is not assigned anywhere in this cell, so it has to exist in the kernel already.\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    fp16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # apply GaLore to all linear layers\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # low-rank projection settings\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi0304\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Wrap `train_data` in a `Dataset`.\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# Build the trainer and run training.\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "# After training: upload to the Hub, then write a local copy to /workspace/model.\n",
+    "trainer.push_to_hub()\n",
+    "trainer.save_model(\"/workspace/model\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "05a48aa8-c597-4ff1-9569-aa210f4f1f5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, Qwen2ForCausalLM\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(Qwen2ForCausalLM):\n",
+    "    \"\"\"Qwen2 causal LM that prepends projected graph-embedding tokens to the prompt.\n",
+    "\n",
+    "    Derives from the concrete `Qwen2ForCausalLM` instead of the `AutoModelForCausalLM`\n",
+    "    factory: the factory's `from_pretrained` hands back a plain `Qwen2ForCausalLM`, so a\n",
+    "    `forward` defined on a factory subclass never runs and the graph embedding is ignored.\n",
+    "    `from_pretrained` is deliberately not overridden: the inherited loader restores\n",
+    "    `graph_proj` from the checkpoint when it is stored there, whereas re-creating the layer\n",
+    "    after loading would replace trained weights with random ones.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        # Projects a 512-dim graph embedding into the model's hidden size.\n",
+    "        self.graph_proj = nn.Linear(512, config.hidden_size)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None, **kwargs):\n",
+    "        \"\"\"Run the language model with an optional graph prefix.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, shape (batch_size, seq_len).\n",
+    "            attention_mask: optional mask, shape (batch_size, seq_len).\n",
+    "            labels: optional targets, shape (batch_size, seq_len).\n",
+    "            graph_embedding: (batch_size, 512) or (batch_size, k, 512); None skips the prefix.\n",
+    "            **kwargs: forwarded unchanged to `Qwen2ForCausalLM.forward`.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output of `Qwen2ForCausalLM.forward`; with a graph prefix the logits cover\n",
+    "            k + seq_len positions.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            return super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)\n",
+    "\n",
+    "        inputs_embeds = self.get_input_embeddings()(input_ids)  # (batch_size, seq_len, hidden_size)\n",
+    "        batch_size = inputs_embeds.shape[0]\n",
+    "\n",
+    "        # Normalise (batch, 512) and (batch, k, 512) to (batch, k, 512) before projecting.\n",
+    "        graph_features = graph_embedding.reshape(batch_size, -1, self.graph_proj.in_features)\n",
+    "        graph_tokens = self.graph_proj(graph_features.to(self.graph_proj.weight.dtype))\n",
+    "        graph_tokens = graph_tokens.to(inputs_embeds.dtype)  # (batch_size, k, hidden_size)\n",
+    "        num_graph_tokens = graph_tokens.shape[1]\n",
+    "        inputs_embeds = torch.cat([graph_tokens, inputs_embeds], dim=1)\n",
+    "\n",
+    "        # The prefix positions are always attended to.\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = attention_mask.new_ones((batch_size, num_graph_tokens))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        # Keep labels aligned with the longer sequence; -100 excludes the prefix from the loss.\n",
+    "        if labels is not None:\n",
+    "            ignored = labels.new_full((batch_size, num_graph_tokens), -100)\n",
+    "            labels = torch.cat([ignored, labels], dim=1)\n",
+    "\n",
+    "        return super().forward(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "            **kwargs,\n",
+    "        )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "73ae15d9-c9d9-4e64-ac8b-2d5877eac984",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "21c8df04-0dc2-436c-aaaf-74a885f734d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Qwen2ForCausalLM(\n",
+       "  (model): Qwen2Model(\n",
+       "    (embed_tokens): Embedding(151936, 1536)\n",
+       "    (layers): ModuleList(\n",
+       "      (0-27): 28 x Qwen2DecoderLayer(\n",
+       "        (self_attn): Qwen2Attention(\n",
+       "          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
+       "        )\n",
+       "        (mlp): Qwen2MLP(\n",
+       "          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
+       "          (act_fn): SiLU()\n",
+       "        )\n",
+       "        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "      )\n",
+       "    )\n",
+       "    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "    (rotary_emb): Qwen2RotaryEmbedding()\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
+       "  (graph_proj): Linear(in_features=512, out_features=1536, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Load the tokenizer from the base checkpoint.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# Load the trained model from local disk and move it to the selected device.\n",
+    "model_path = \"/workspace/model\"\n",
+    "model = GraphAwareLM.from_pretrained(model_path).to(device)\n",
+    "model.eval()  # switch to inference mode\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7a8562c0-8d55-4412-8f89-de20bae0f7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "# Use the first record of the file as the evaluation example.\n",
+    "test_data = data[0]\n",
+    "\n",
+    "# .get() yields None when a key is absent; the lines below assume both keys exist.\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\") \n",
+    "\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32).to(device)\n",
+    "\n",
+    "# The turn at index 4 is the prompt; its `<image>` placeholder is removed.\n",
+    "question1 = conversations[4][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Input text\n",
+    "# NOTE(review): ROLE_TOKENS is defined here but not referenced anywhere in this cell.\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH\n",
+    "# Truncate the prompt to max_seq_length - GRAPH_LENGTH (= 1100) tokens.\n",
+    "inputs = tokenizer(question1, return_tensors=\"pt\",truncation=True,max_length=max_seq_length - GRAPH_LENGTH).to(device)\n",
+    "\n",
+    "input_ids = inputs[\"input_ids\"]\n",
+    "attention_mask = inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62f40327-f102-4259-80a5-8761d5d7d3c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "         -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "         -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "         -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "         -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "          0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "         -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "         -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "         -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "         -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "          0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "          0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "         -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "          1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "         -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "         -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "         -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "         -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "         -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "          0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "          0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "         -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "          1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "          0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "         -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "         -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "         -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "         -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "         -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "          2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "         -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "         -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "         -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "          0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "         -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "         -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "          1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "         -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "         -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "         -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "         -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "         -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "          1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "          0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "         -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "          1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "         -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "          0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "         -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "         -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "          2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "         -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "          0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "         -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "         -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "          0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "          0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "         -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "          0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "         -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "         -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "         -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "         -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "          1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635]],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph_embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "067a0cf7-3010-4b6b-b2aa-d4ce95010d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "模型回复：  How\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One forward pass over the prompt; gradients are not needed for inference.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(\n",
+    "        input_ids=input_ids,\n",
+    "        attention_mask=attention_mask,\n",
+    "        graph_embedding=graph_embedding,\n",
+    "    )\n",
+    "\n",
+    "# Greedy choice: the highest-scoring vocabulary entry at the final position.\n",
+    "logits = outputs.logits[:, -1, :]\n",
+    "predicted_id = logits.argmax(dim=-1)\n",
+    "\n",
+    "# Convert the chosen token id back to text and show it.\n",
+    "response_text = tokenizer.decode(predicted_id, skip_special_tokens=True)\n",
+    "print(\"模型回复：\", response_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ae38ed68-bc6a-4bc3-aee8-d54d2dd689ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: Is there any sequential logic in the module, and if so, how is it handled? What are the module's inputs and outputs?\n",
+      "What are the module's inputs and outputs?\n",
+      "What are the module's inputs and outputs?\n",
+      "What are the module's inputs and outputs?\n",
+      "What is the module's input, and what is the module's output, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's output, and what is the module's input, and what is the module's output, and what is the module's output, and what is the module's output, and what is the module's output, and what is the module's output, and module's output, and module's input, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output. 
Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output. Is the module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and 
module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module's output, and module\n"
+     ]
+    }
+   ],
+   "source": [
+    "max_new_tokens = 1024\n",
+    "generated_ids = input_ids.clone()\n",
+    "generated_attention_mask = attention_mask.clone()\n",
+    "\n",
+    "# Greedy decoding, one token per iteration. The full sequence is fed to the model again on\n",
+    "# every step, and the `.item()` call plus the (1, 1) mask extension assume a batch of one.\n",
+    "for _ in range(max_new_tokens):\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,\n",
+    "            attention_mask=generated_attention_mask,\n",
+    "            graph_embedding=graph_embedding,\n",
+    "        )\n",
+    "\n",
+    "    # Scores at the last position decide the next token.\n",
+    "    logits = outputs.logits[:, -1, :]\n",
+    "    next_token = logits.argmax(dim=-1)\n",
+    "\n",
+    "    # Append the new token to the running sequence.\n",
+    "    generated_ids = torch.cat((generated_ids, next_token[:, None]), dim=1)\n",
+    "\n",
+    "    # Stop once the end-of-sequence token has been produced.\n",
+    "    if next_token.item() == tokenizer.eos_token_id:\n",
+    "        break\n",
+    "\n",
+    "    # Grow the mask by one attended position so it matches the longer sequence.\n",
+    "    generated_attention_mask = torch.cat(\n",
+    "        (generated_attention_mask, generated_attention_mask.new_ones((1, 1))), dim=1\n",
+    "    )\n",
+    "\n",
+    "# Decode prompt plus continuation and print the result.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "803f41fe-f504-4c2a-96b4-afc2cd437d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[151646,   3838,    525,    279,   8286,  17473,    304,    279,   6250,\n",
+       "          50773,   2038,    369,    279,  29952,   4688,     11,    323,   1128,\n",
+       "            525,    862,   9895,     30]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87d1396b-4d20-4a76-a092-b26a587a76ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

graph_train2.ipynb ADDED Viewed

	@@ -0,0 +1,1506 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fa17529d-eaa7-473e-9d2d-cc05a0120a51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Marker strings for the two conversation roles, keyed by the role names \"human\" and \"gpt\".\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "# Base checkpoint identifier.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\" \n",
+    "# Added on top of the 1100-token budget when max_seq_length is computed in a later cell.\n",
+    "GRAPH_LENGTH = 512\n",
+    "# presumably the Hugging Face Hub repository id for the trained model - TODO confirm\n",
+    "HF_NAME = \"KSU-HW-SEC/r1q1.5_graph_lora_new2\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bba6e6db-4b79-4461-ba13-75fd41019358",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CUDA 可用: True\n",
+      "GPU 数量: 1\n",
+      "当前 GPU: 0\n",
+      "GPU 名称: NVIDIA A100 80GB PCIe\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Optional one-time environment setup (kept commented out):\n",
+    "# %pip install transformers accelerate datasets\n",
+    "# %pip install galore-torch\n",
+    "# %pip install huggingface_hub\n",
+    "import torch\n",
+    "\n",
+    "cuda_available = torch.cuda.is_available()\n",
+    "print(\"CUDA 可用:\", cuda_available)\n",
+    "print(\"GPU 数量:\", torch.cuda.device_count())\n",
+    "# current_device() and get_device_name() raise when no CUDA device is usable,\n",
+    "# so they are only queried on a machine that actually has one.\n",
+    "if cuda_available:\n",
+    "    print(\"当前 GPU:\", torch.cuda.current_device())\n",
+    "    print(\"GPU 名称:\", torch.cuda.get_device_name(torch.cuda.current_device()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ef5551ca-89e2-4488-8e68-1c8d964de039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Total sequence budget: 1100 positions plus GRAPH_LENGTH.\n",
+    "# NOTE(review): the origin of the 1100 figure is not shown in this notebook - confirm.\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH  # maximum sequence length"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8e283f49-fde4-46e2-9891-dbc304058f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:48: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_111730-i9v1vlu1</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/i9v1vlu1' target=\"_blank\">experi030402</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/i9v1vlu1' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/i9v1vlu1</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5310' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5310/5310 1:34:08, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>5.319300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>3.641300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>1.521800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>1.027500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>0.922400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>0.866900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>0.800500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>0.721600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>0.740400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>0.737000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>550</td>\n",
+       "      <td>0.713500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>600</td>\n",
+       "      <td>0.747000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>650</td>\n",
+       "      <td>0.869500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>700</td>\n",
+       "      <td>1.473300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>750</td>\n",
+       "      <td>0.753000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>800</td>\n",
+       "      <td>0.741300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>850</td>\n",
+       "      <td>0.751400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>900</td>\n",
+       "      <td>0.787600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>950</td>\n",
+       "      <td>0.783200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1000</td>\n",
+       "      <td>0.780200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1050</td>\n",
+       "      <td>1.012900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1100</td>\n",
+       "      <td>1.411700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1150</td>\n",
+       "      <td>1.536400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1200</td>\n",
+       "      <td>0.853800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1250</td>\n",
+       "      <td>0.756500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1300</td>\n",
+       "      <td>0.750800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1350</td>\n",
+       "      <td>0.747400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1400</td>\n",
+       "      <td>0.844400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1450</td>\n",
+       "      <td>0.858400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1500</td>\n",
+       "      <td>1.053400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1550</td>\n",
+       "      <td>1.591600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1600</td>\n",
+       "      <td>1.498900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1650</td>\n",
+       "      <td>1.471700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1700</td>\n",
+       "      <td>1.221100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1750</td>\n",
+       "      <td>1.802300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1800</td>\n",
+       "      <td>1.826000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1850</td>\n",
+       "      <td>1.857300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1900</td>\n",
+       "      <td>1.561800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1950</td>\n",
+       "      <td>1.398800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>1.398900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2050</td>\n",
+       "      <td>1.381600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2100</td>\n",
+       "      <td>0.890300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2150</td>\n",
+       "      <td>0.763700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2200</td>\n",
+       "      <td>0.753100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2250</td>\n",
+       "      <td>0.745500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2300</td>\n",
+       "      <td>1.186100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2350</td>\n",
+       "      <td>0.862000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2400</td>\n",
+       "      <td>1.024600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2450</td>\n",
+       "      <td>1.028400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2500</td>\n",
+       "      <td>1.008500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2550</td>\n",
+       "      <td>0.942800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2600</td>\n",
+       "      <td>0.849700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2650</td>\n",
+       "      <td>0.771400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2700</td>\n",
+       "      <td>0.794100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2750</td>\n",
+       "      <td>0.819200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2800</td>\n",
+       "      <td>0.937500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2850</td>\n",
+       "      <td>1.064500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2900</td>\n",
+       "      <td>1.189300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2950</td>\n",
+       "      <td>1.071100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3000</td>\n",
+       "      <td>1.003300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3050</td>\n",
+       "      <td>1.073900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3100</td>\n",
+       "      <td>1.043100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3150</td>\n",
+       "      <td>1.282600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3200</td>\n",
+       "      <td>2.145400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3250</td>\n",
+       "      <td>1.925800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3300</td>\n",
+       "      <td>2.005600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3350</td>\n",
+       "      <td>2.122600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3400</td>\n",
+       "      <td>2.163000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3450</td>\n",
+       "      <td>2.046600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3500</td>\n",
+       "      <td>2.152200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3550</td>\n",
+       "      <td>2.151700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3600</td>\n",
+       "      <td>5.394900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3650</td>\n",
+       "      <td>4.677800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3700</td>\n",
+       "      <td>4.122200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3750</td>\n",
+       "      <td>3.710200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3800</td>\n",
+       "      <td>3.350800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3850</td>\n",
+       "      <td>3.126300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3900</td>\n",
+       "      <td>2.988700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3950</td>\n",
+       "      <td>2.872000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>2.848200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4050</td>\n",
+       "      <td>2.823900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4100</td>\n",
+       "      <td>2.781200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4150</td>\n",
+       "      <td>2.735000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4200</td>\n",
+       "      <td>2.725900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4250</td>\n",
+       "      <td>2.644400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4300</td>\n",
+       "      <td>2.700000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4350</td>\n",
+       "      <td>2.650100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4400</td>\n",
+       "      <td>2.704500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4450</td>\n",
+       "      <td>2.596700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4500</td>\n",
+       "      <td>2.510500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4550</td>\n",
+       "      <td>2.515800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4600</td>\n",
+       "      <td>2.498100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4650</td>\n",
+       "      <td>2.458900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4700</td>\n",
+       "      <td>2.449700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4750</td>\n",
+       "      <td>2.425000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4800</td>\n",
+       "      <td>2.362300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4850</td>\n",
+       "      <td>2.232000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4900</td>\n",
+       "      <td>2.361500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4950</td>\n",
+       "      <td>2.302300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5000</td>\n",
+       "      <td>2.333900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5050</td>\n",
+       "      <td>2.367200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5100</td>\n",
+       "      <td>2.288300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5150</td>\n",
+       "      <td>2.426100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5200</td>\n",
+       "      <td>2.344100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5250</td>\n",
+       "      <td>2.283500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5300</td>\n",
+       "      <td>2.296500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new2/commit/291285a5f2155c79a0da893645d8df9bbca98f63', commit_message='End of training', commit_description='', oid='291285a5f2155c79a0da893645d8df9bbca98f63', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new2', endpoint='https://huggingface.co', repo_type='model', repo_id='KSU-HW-SEC/r1q1.5_graph_lora_new2'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Load the prepared training samples from train_data.pt (the data-preparation cell of\n",
+    "# this notebook reports saving to that file).\n",
+    "# SECURITY: weights_only=False unpickles arbitrary Python objects, so only load a\n",
+    "# train_data.pt that was produced by this notebook itself.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):  # make sure the file exists\n",
+    "    # Raise a clear error rather than calling exit() inside a notebook.\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Use the default base model when MODEL_NAME has not been defined yet.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"  # default model\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    \"\"\"Map-style dataset over a sequence of prepared training samples.\n",
+    "\n",
+    "    Every sample is indexed with the keys in ``_FIELDS`` and returned as a new dict\n",
+    "    holding exactly those entries.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Keys copied from each sample; `graph_embedding` is the extra (non-text) input.\n",
+    "    _FIELDS = (\"input_ids\", \"attention_mask\", \"graph_embedding\", \"labels\")\n",
+    "\n",
+    "    def __init__(self, data):\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        record = self.data[idx]\n",
+    "        return {field: record[field] for field in self._FIELDS}\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal language model that reads a graph embedding as one extra leading token.\n",
+    "\n",
+    "    The graph embedding is projected to the hidden size of the wrapped language model and\n",
+    "    prepended to the token embeddings; the attention mask and the labels are extended by\n",
+    "    one position so that all three stay aligned.\n",
+    "\n",
+    "    This is a plain ``nn.Module`` wrapper. ``AutoModelForCausalLM`` is a factory: it cannot\n",
+    "    be instantiated directly, and calling ``from_pretrained`` on a subclass of it returns the\n",
+    "    architecture-specific model, so such a subclass never runs its own ``__init__`` or\n",
+    "    ``forward`` and the graph projection would silently be missing.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, graph_dim=512):\n",
+    "        \"\"\"\n",
+    "        Args:\n",
+    "            pretrained_model_name_or_path: Hub id or local path of the base causal LM.\n",
+    "            graph_dim: width of the incoming graph embedding (512 by default).\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "\n",
+    "        # Load the pretrained base model once and reuse its configuration.\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)\n",
+    "        # Exposed on the wrapper because Trainer utilities read `model.config`.\n",
+    "        self.config = self.model.config\n",
+    "\n",
+    "        # Linear map from the graph embedding width to the LM hidden size.\n",
+    "        self.graph_proj = nn.Linear(graph_dim, self.config.hidden_size)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):\n",
+    "        \"\"\"Build the wrapper around a pretrained base model.\n",
+    "\n",
+    "        Kept so that existing ``GraphAwareLM.from_pretrained(name)`` calls keep working;\n",
+    "        extra keyword arguments are passed to ``__init__``.\n",
+    "        \"\"\"\n",
+    "        return cls(pretrained_model_name_or_path, **kwargs)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the base model with the projected graph embedding prepended.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, shape (batch_size, seq_len).\n",
+    "            attention_mask: optional mask, shape (batch_size, seq_len).\n",
+    "            labels: optional target ids, shape (batch_size, seq_len).\n",
+    "            graph_embedding: optional graph vectors, shape (batch_size, graph_dim).\n",
+    "                When None the base model is called on the text alone.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output object of the wrapped causal LM (with ``loss`` when labels are given).\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            # Nothing to prepend: behave exactly like the base model.\n",
+    "            return self.model(\n",
+    "                input_ids=input_ids,\n",
+    "                attention_mask=attention_mask,\n",
+    "                labels=labels,\n",
+    "            )\n",
+    "\n",
+    "        # Token embeddings: (batch_size, seq_len, hidden_size)\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n",
+    "\n",
+    "        # Project the graph embedding and shape it as one token: (batch_size, 1, hidden_size)\n",
+    "        graph_token = self.graph_proj(graph_embedding.to(self.graph_proj.weight.dtype))\n",
+    "        graph_token = graph_token.unsqueeze(1).to(inputs_embeds.dtype)\n",
+    "\n",
+    "        # Prepend the graph token: (batch_size, seq_len + 1, hidden_size)\n",
+    "        inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)\n",
+    "\n",
+    "        # The graph token is always attended to: (batch_size, seq_len + 1)\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = attention_mask.new_ones((attention_mask.shape[0], 1))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        # Keep labels aligned with the lengthened sequence. -100 is the index the\n",
+    "        # language-modeling loss ignores, so no loss is computed for the graph position.\n",
+    "        if labels is not None:\n",
+    "            graph_label = labels.new_full((labels.shape[0], 1), -100)\n",
+    "            labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        return self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer whose loss step forwards the optional graph embedding to the model.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        \"\"\"Call the model on one batch and return its loss.\n",
+    "\n",
+    "        `graph_embedding` is passed to the model only when the batch provides a\n",
+    "        non-None value for it. Returns `(loss, outputs)` when `return_outputs` is\n",
+    "        true, otherwise just the loss.\n",
+    "        \"\"\"\n",
+    "        forward_kwargs = {\n",
+    "            \"input_ids\": inputs[\"input_ids\"],\n",
+    "            \"attention_mask\": inputs[\"attention_mask\"],\n",
+    "            \"labels\": inputs[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "        graph_embedding = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_embedding is not None:\n",
+    "            forward_kwargs[\"graph_embedding\"] = graph_embedding\n",
+    "\n",
+    "        outputs = model(**forward_kwargs)\n",
+    "        loss = outputs.loss\n",
+    "        if return_outputs:\n",
+    "            return (loss, outputs)\n",
+    "        return loss\n",
+    "\n",
+    "\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# Load the model to fine-tune\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# # 1. Load the model configuration\n",
+    "# config = AutoConfig.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# # 2. Create a GraphAwareLM instance from the configuration\n",
+    "# model = GraphAwareLM.from_config(config) \n",
+    "\n",
+    "# pretrained_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "# model.load_state_dict(pretrained_model.state_dict(), strict=False)\n",
+    "\n",
+    "# Load the modified `GraphAwareLM` model\n",
+    "# model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "# model.config.use_sliding_window_attention = False\n",
+    "\n",
+    "# Training arguments\n",
+    "# NOTE(review): in the run recorded in this cell's output the training loss fell to about\n",
+    "# 0.72 by step 400, then rose again and jumped to 5.39 at step 3600; the optimizer\n",
+    "# settings below may need revisiting.\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results2\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    bf16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # apply GaLore to all linear layers\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # low-rank projection parameters\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi030402\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Wrap `train_data` in a `Dataset`\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# Train, then save the model locally and push it to the Hub\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.save_model(\"/workspace/model2\")\n",
+    "trainer.push_to_hub()\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7a72ac3b-561e-41d3-ae93-99f20acf3188",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora-wandb', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora-wandb')"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "\n",
+    "# Create the Hub repository; exist_ok=True means an already existing repository\n",
+    "# is not treated as an error.\n",
+    "repo_name = \"r1q1.5_graph_lora-wandb\"  # your model name\n",
+    "api = HfApi()\n",
+    "api.create_repo(repo_id=repo_name, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "73c434b9-5d58-4819-8526-24aa18ca1010",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2bf786e437064435b543c4b364404933",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "run-v0v96nik.wandb:   0%|          | 0.00/582k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d8d8867a8978418cbba012ae48c6a461",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "run-i9v1vlu1.wandb:   0%|          | 0.00/617k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aa41d13f7f204554a401f018f535d83a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4f6a00ed3d4e43c9806cb5050b812bf8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "run-e0v0giuw.wandb:   0%|          | 0.00/616k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/YiFzhao/r1q1.5_graph_lora-wandb/commit/81d72bb1534aa8769166ca2e2dd6f4a657ab3742', commit_message='upload wandb', commit_description='', oid='81d72bb1534aa8769166ca2e2dd6f4a657ab3742', pr_url=None, repo_url=RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora-wandb', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora-wandb'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import upload_folder\n",
+    "\n",
+    "# Upload the local Weights & Biases run directory to the Hub repository.\n",
+    "upload_folder(\n",
+    "    repo_id=\"YiFzhao/r1q1.5_graph_lora-wandb\",\n",
+    "    folder_path=\"/workspace/wandb\",\n",
+    "    commit_message=\"upload wandb\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8d2ebf87-402e-444d-8599-96c313f1b7fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 处理后数据条数: 12384\n",
+      "✅ 示例数据: {'input_ids': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]), 'labels': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'graph_embedding': tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+      "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+      "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+      "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+      "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+      "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+      "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+      "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+      "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+      "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+      "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+      "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+      "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+      "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+      "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+      "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+      "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+      "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+      "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+      "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+      "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+      "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+      "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+      "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+      "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+      "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+      "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+      "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+      "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+      "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+      "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+      "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+      "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+      "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+      "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+      "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+      "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+      "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+      "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+      "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+      "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+      "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+      "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+      "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+      "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+      "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+      "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+      "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+      "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+      "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+      "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+      "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+      "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+      "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+      "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+      "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+      "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+      "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+      "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+      "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+      "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+      "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+      "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+      "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635])}\n",
+      "✅ train_data 已保存到 train_data.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Build `train_data` (one dict of tensors per sample in final_Graph.json) and save it to train_data.pt.\n",
+    "# MODEL_NAME, ROLE_TOKENS, GRAPH_LENGTH and max_seq_length are read from the notebook namespace;\n",
+    "# they are not defined in this cell.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token\n",
+    "\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "train_data = []\n",
+    "\n",
+    "for sample in data:\n",
+    "    conversations = sample.get(\"conversations\", [])\n",
+    "    embeddings = sample.get(\"embedding\", [])\n",
+    "\n",
+    "    # Skip samples that carry no usable embedding list.\n",
+    "    if not isinstance(embeddings, list) or len(embeddings) == 0:\n",
+    "        print(f\"无效的 embedding，跳过样本：{sample}\")\n",
+    "        continue\n",
+    "\n",
+    "    # Drop a leading singleton dimension if there is one (expected result: [512] -- TODO confirm).\n",
+    "    graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0)\n",
+    "\n",
+    "    # Concatenate every turn as \"<role token> <content>\\n\", with the <image> placeholder removed.\n",
+    "    turns = []\n",
+    "    for conv in conversations:\n",
+    "        role = conv[\"from\"]  # \"human\" or \"gpt\"\n",
+    "        content = conv[\"value\"].replace(\"<image>\", \"\")\n",
+    "        role_token = ROLE_TOKENS.get(role, f\"<|{role}|>\")  # fallback token for unknown roles\n",
+    "        turns.append(f\"{role_token} {content}\\n\")\n",
+    "    dialogue_text = \"\".join(turns)\n",
+    "\n",
+    "    tokenized = tokenizer(\n",
+    "        dialogue_text,\n",
+    "        padding=\"max_length\",\n",
+    "        truncation=True,\n",
+    "        max_length=max_seq_length - GRAPH_LENGTH,  # leave room for the graph embedding\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "\n",
+    "    input_ids = tokenized[\"input_ids\"].squeeze(0)\n",
+    "    attention_mask = tokenized[\"attention_mask\"].squeeze(0)\n",
+    "\n",
+    "    # Padding reuses the EOS id, so copying input_ids verbatim would make every padded position a\n",
+    "    # training target. Mask padding with -100 (the ignore index of the LM loss) and keep only the\n",
+    "    # first slot right after the text, so a single EOS is still supervised when padding follows the text.\n",
+    "    labels = input_ids.clone()\n",
+    "    labels[attention_mask == 0] = -100\n",
+    "    real_positions = attention_mask.nonzero(as_tuple=True)[0]\n",
+    "    if real_positions.numel() > 0:\n",
+    "        first_after_text = int(real_positions[-1]) + 1\n",
+    "        if first_after_text < labels.numel():\n",
+    "            labels[first_after_text] = input_ids[first_after_text]\n",
+    "\n",
+    "    train_data.append({\n",
+    "        \"input_ids\": input_ids,\n",
+    "        \"attention_mask\": attention_mask,\n",
+    "        \"labels\": labels,\n",
+    "        \"graph_embedding\": graph_embedding,\n",
+    "    })\n",
+    "\n",
+    "print(\"🚀 处理后数据条数:\", len(train_data))\n",
+    "if train_data:\n",
+    "    # Show tensor shapes only; printing the full sample floods the notebook output.\n",
+    "    print(\"✅ 示例数据:\", {key: tuple(value.shape) for key, value in train_data[0].items()})\n",
+    "torch.save(train_data, \"train_data.pt\")\n",
+    "print(\"✅ train_data 已保存到 train_data.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "05a48aa8-c597-4ff1-9569-aa210f4f1f5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM wrapper that feeds one projected graph-embedding token ahead of the text tokens.\n",
+    "\n",
+    "    The base Hugging Face model is stored in `self.model`; `self.graph_proj` maps a\n",
+    "    512-dimensional graph embedding to the base model's hidden size.\n",
+    "\n",
+    "    The class derives from `nn.Module` rather than `AutoModelForCausalLM`:\n",
+    "    `AutoModelForCausalLM.from_pretrained` returns the concrete architecture class, so methods\n",
+    "    defined on an Auto* subclass never reach the loaded object (calling `generate_with_graph`\n",
+    "    on it raises AttributeError).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Load the base model once and create the graph projection.\n",
+    "\n",
+    "        Args:\n",
+    "            pretrained_model_name_or_path: hub id or local directory of the base causal LM.\n",
+    "            *model_args, **kwargs: forwarded to `AutoModelForCausalLM.from_pretrained`.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)\n",
+    "        self.config = self.model.config\n",
+    "\n",
+    "        # Linear map from the 512-dim graph embedding to `hidden_size`.\n",
+    "        # NOTE(review): this layer starts from fresh random weights; if the checkpoint stores trained\n",
+    "        # `graph_proj` parameters they still have to be loaded into it -- confirm the checkpoint layout.\n",
+    "        self.graph_proj = nn.Linear(512, self.config.hidden_size)\n",
+    "\n",
+    "    def _prepend_graph_token(self, input_ids, attention_mask, graph_embedding):\n",
+    "        \"\"\"Return (inputs_embeds, attention_mask) with the projected graph token at position 0.\"\"\"\n",
+    "        if graph_embedding.dim() == 1:\n",
+    "            # A single un-batched embedding: (512,) -> (1, 512).\n",
+    "            graph_embedding = graph_embedding.unsqueeze(0)\n",
+    "\n",
+    "        token_embeds = self.model.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden)\n",
+    "\n",
+    "        proj_weight = self.graph_proj.weight\n",
+    "        graph_token = self.graph_proj(graph_embedding.to(device=proj_weight.device, dtype=proj_weight.dtype))\n",
+    "        graph_token = graph_token.unsqueeze(1).to(token_embeds.dtype)  # (batch, 1, hidden)\n",
+    "        if graph_token.shape[0] == 1 and token_embeds.shape[0] > 1:\n",
+    "            # One graph embedding shared by every sequence of the batch.\n",
+    "            graph_token = graph_token.expand(token_embeds.shape[0], -1, -1)\n",
+    "\n",
+    "        inputs_embeds = torch.cat([graph_token, token_embeds], dim=1)  # (batch, seq_len + 1, hidden)\n",
+    "\n",
+    "        if attention_mask is not None:\n",
+    "            # The graph token is always attended to.\n",
+    "            graph_mask = attention_mask.new_ones((attention_mask.shape[0], 1))\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)  # (batch, seq_len + 1)\n",
+    "\n",
+    "        return inputs_embeds, attention_mask\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None, **kwargs):\n",
+    "        \"\"\"Run the base model on the token embeddings with the graph token prepended.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch_size, seq_len) token ids.\n",
+    "            attention_mask: optional (batch_size, seq_len) mask; extended by one column of ones.\n",
+    "            labels: optional (batch_size, seq_len) targets; extended by one ignored (-100) column.\n",
+    "            graph_embedding: (batch_size, 512) or (512,) tensor; when None the base model is\n",
+    "                called on `input_ids` unchanged.\n",
+    "            **kwargs: forwarded to the base model.\n",
+    "\n",
+    "        Returns:\n",
+    "            Whatever `self.model(...)` returns.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)\n",
+    "\n",
+    "        inputs_embeds, attention_mask = self._prepend_graph_token(input_ids, attention_mask, graph_embedding)\n",
+    "\n",
+    "        if labels is not None:\n",
+    "            # The graph token has no target; give it the ignore index so that labels have the same\n",
+    "            # length (seq_len + 1) as the logits.\n",
+    "            ignored = labels.new_full((labels.shape[0], 1), -100)\n",
+    "            labels = torch.cat([ignored, labels], dim=1)\n",
+    "\n",
+    "        return self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "            **kwargs,\n",
+    "        )\n",
+    "\n",
+    "    def generate_with_graph(self, inputs, graph_embedding, max_length=500, temperature=0.7, top_k=50, top_p=0.9, tokenizer=None):\n",
+    "        \"\"\"Generate text for `inputs` with the graph token prepended.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: mapping with `input_ids` and optionally `attention_mask` (e.g. a tokenizer output).\n",
+    "            graph_embedding: tensor of shape (512,) or (1, 512).\n",
+    "            max_length, temperature, top_k, top_p: passed to `self.model.generate`.\n",
+    "            tokenizer: used to decode the result; when omitted, the module-level `tokenizer` is used.\n",
+    "\n",
+    "        Returns:\n",
+    "            The decoded text of the first returned sequence.\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: if no tokenizer is passed and no global `tokenizer` exists.\n",
+    "        \"\"\"\n",
+    "        if tokenizer is None:\n",
+    "            tokenizer = globals().get(\"tokenizer\")\n",
+    "        if tokenizer is None:\n",
+    "            raise ValueError(\"generate_with_graph needs a tokenizer: pass `tokenizer=` or define a global `tokenizer`\")\n",
+    "\n",
+    "        mask = inputs[\"attention_mask\"] if \"attention_mask\" in inputs else None\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            inputs_embeds, attention_mask = self._prepend_graph_token(inputs[\"input_ids\"], mask, graph_embedding)\n",
+    "            output_ids = self.model.generate(\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                attention_mask=attention_mask,\n",
+    "                max_length=max_length,\n",
+    "                do_sample=True,  # temperature / top_k / top_p only take effect when sampling\n",
+    "                temperature=temperature,\n",
+    "                top_k=top_k,\n",
+    "                top_p=top_p,\n",
+    "                num_return_sequences=1,\n",
+    "            )\n",
+    "\n",
+    "        return tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Build a `GraphAwareLM` (not the bare base model) from a hub id or checkpoint directory.\"\"\"\n",
+    "        return cls(pretrained_model_name_or_path, *model_args, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "73ae15d9-c9d9-4e64-ac8b-2d5877eac984",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run on the GPU when CUDA is available, otherwise on the CPU.\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "21c8df04-0dc2-436c-aaaf-74a885f734d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7ad289c5523340f39799ad11e3bc1bb5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Qwen2ForCausalLM(\n",
+       "  (model): Qwen2Model(\n",
+       "    (embed_tokens): Embedding(151936, 1536)\n",
+       "    (layers): ModuleList(\n",
+       "      (0-27): 28 x Qwen2DecoderLayer(\n",
+       "        (self_attn): Qwen2Attention(\n",
+       "          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
+       "        )\n",
+       "        (mlp): Qwen2MLP(\n",
+       "          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
+       "          (act_fn): SiLU()\n",
+       "        )\n",
+       "        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "      )\n",
+       "    )\n",
+       "    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "    (rotary_emb): Qwen2RotaryEmbedding()\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
+       "  (graph_proj): Linear(in_features=512, out_features=1536, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# The tokenizer is loaded from the base model repository.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# The fine-tuned model is loaded from the training checkpoint directory.\n",
+    "model_path = \"/workspace/model2\"  # NOTE(review): assigned but not used in this cell\n",
+    "model = GraphAwareLM.from_pretrained(\"/workspace/results2/checkpoint-5310\")\n",
+    "model = model.to(device)\n",
+    "model.eval()  # switch to inference mode\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "51995891-8906-4049-9401-2d22e06a84e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[-0.0380, -0.0350, -0.0423,  ...,  0.0213,  0.0148, -0.0047],\n",
+      "        [ 0.0131,  0.0388, -0.0378,  ...,  0.0399, -0.0309, -0.0342],\n",
+      "        [ 0.0084, -0.0116,  0.0259,  ...,  0.0344,  0.0268, -0.0062],\n",
+      "        ...,\n",
+      "        [ 0.0080, -0.0073, -0.0023,  ..., -0.0120,  0.0387,  0.0209],\n",
+      "        [ 0.0277,  0.0326,  0.0270,  ...,  0.0124, -0.0348,  0.0389],\n",
+      "        [ 0.0184, -0.0410, -0.0415,  ...,  0.0255, -0.0429, -0.0386]],\n",
+      "       device='cuda:0', requires_grad=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the weight matrix of the graph projection layer of the loaded model.\n",
+    "# NOTE(review): GraphAwareLM creates graph_proj with a fresh nn.Linear(512, hidden_size); nn.Linear's\n",
+    "# default init draws from U(-1/sqrt(512), 1/sqrt(512)), i.e. about +/-0.044, so values confined to that\n",
+    "# range would suggest the trained projection was not restored from the checkpoint -- confirm.\n",
+    "print(model.graph_proj.weight)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7a8562c0-8d55-4412-8f89-de20bae0f7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Build the inference inputs (prompt tokens + graph embedding) from the first sample of final_Graph.json.\n",
+    "# `tokenizer` and `device` are read from the notebook namespace.\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "test_data = data[0]\n",
+    "\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\")\n",
+    "\n",
+    "# Keep an explicit batch dimension: GraphAwareLM.forward documents graph_embedding as (batch_size, 512).\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32).reshape(1, -1).to(device)\n",
+    "\n",
+    "# Turn 4 of the conversation is used as the question (presumably a human turn -- verify against the data).\n",
+    "question1 = conversations[4][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",\n",
+    "    \"gpt\": \"<|Assistant|>\",\n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH\n",
+    "\n",
+    "# Use the same \"<role token> <content>\\n\" layout as the training text and end with the assistant\n",
+    "# token, so the model continues with an answer instead of continuing the question.\n",
+    "prompt = f\"{ROLE_TOKENS['human']} {question1}\\n{ROLE_TOKENS['gpt']}\"\n",
+    "inputs = tokenizer(\n",
+    "    prompt,\n",
+    "    return_tensors=\"pt\",\n",
+    "    truncation=True,\n",
+    "    max_length=max_seq_length - GRAPH_LENGTH,\n",
+    ").to(device)\n",
+    "\n",
+    "input_ids = inputs[\"input_ids\"]\n",
+    "attention_mask = inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "4bd7493f-ca8d-4c28-914d-95b1c30f8fcc",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Qwen2ForCausalLM' object has no attribute 'generate_with_graph'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m generated_text \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_with_graph\u001b[49m(inputs, graph_embedding)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1695\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   1693\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m   1694\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1695\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'Qwen2ForCausalLM' object has no attribute 'generate_with_graph'"
+     ]
+    }
+   ],
+   "source": [
+    "# Generate a reply from the prompt tokens together with the graph embedding.\n",
+    "generated_text = model.generate_with_graph(inputs=inputs, graph_embedding=graph_embedding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62f40327-f102-4259-80a5-8761d5d7d3c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the graph embedding tensor as the cell output.\n",
+    "graph_embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "067a0cf7-3010-4b6b-b2aa-d4ce95010d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "模型回复：  How\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One forward pass without gradient tracking.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(\n",
+    "        input_ids=input_ids,\n",
+    "        attention_mask=attention_mask,\n",
+    "        graph_embedding=graph_embedding,\n",
+    "    )\n",
+    "\n",
+    "# Greedy choice: argmax over the vocabulary at the last sequence position.\n",
+    "logits = outputs.logits[:, -1, :]\n",
+    "predicted_id = logits.argmax(dim=-1)\n",
+    "\n",
+    "# Convert the chosen token id back to text.\n",
+    "response_text = tokenizer.decode(predicted_id, skip_special_tokens=True)\n",
+    "\n",
+    "print(\"模型回复：\", response_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "ae38ed68-bc6a-4bc3-aee8-d54d2dd689ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: Is there any sequential logic in the module, and if so, how is it handled? `data` is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the 
output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit input, and the output is the output of the `data` is a 1-bit data, and the output is the output of the `data` is a 1-bit\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Greedy decoding: append one argmax token per step, re-running the model on the whole sequence.\n",
+    "max_new_tokens = 1024\n",
+    "generated_ids = input_ids.clone()\n",
+    "generated_attention_mask = attention_mask.clone()\n",
+    "\n",
+    "for _ in range(max_new_tokens):\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,\n",
+    "            attention_mask=generated_attention_mask,\n",
+    "            graph_embedding=graph_embedding,\n",
+    "        )\n",
+    "\n",
+    "    # Logits at the last position decide the next token.\n",
+    "    logits = outputs.logits[:, -1, :]\n",
+    "    next_token = logits.argmax(dim=-1)\n",
+    "\n",
+    "    # Extend the running sequence with the new token.\n",
+    "    generated_ids = torch.cat([generated_ids, next_token[:, None]], dim=1)\n",
+    "\n",
+    "    # Stop once EOS has been produced (the EOS id itself stays in generated_ids).\n",
+    "    if next_token.item() == tokenizer.eos_token_id:\n",
+    "        break\n",
+    "\n",
+    "    # Grow the mask by one attended position to match the longer sequence.\n",
+    "    generated_attention_mask = torch.cat(\n",
+    "        [generated_attention_mask, generated_attention_mask.new_ones((1, 1))],\n",
+    "        dim=1,\n",
+    "    )\n",
+    "\n",
+    "# Decode prompt + continuation into text.\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "803f41fe-f504-4c2a-96b4-afc2cd437d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[151646,   3838,    525,    279,   8286,  17473,    304,    279,   6250,\n",
+       "          50773,   2038,    369,    279,  29952,   4688,     11,    323,   1128,\n",
+       "            525,    862,   9895,     30]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Display the token ids currently held in generated_ids.\n",
+    "generated_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87d1396b-4d20-4a76-a092-b26a587a76ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

graph_train3.ipynb ADDED Viewed

	@@ -0,0 +1,1588 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fa17529d-eaa7-473e-9d2d-cc05a0120a51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Notebook-wide configuration.\n",
+    "# Maps the dataset's speaker names to the role tokens placed in front of each turn.\n",
+    "ROLE_TOKENS = dict(human=\"<|User|>\", gpt=\"<|Assistant|>\")\n",
+    "\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "GRAPH_LENGTH = 512\n",
+    "HF_NAME = \"KSU-HW-SEC/r1q1.5_graph_lora_new3\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bba6e6db-4b79-4461-ba13-75fd41019358",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CUDA 可用: True\n",
+      "GPU 数量: 1\n",
+      "当前 GPU: 0\n",
+      "GPU 名称: NVIDIA A100 80GB PCIe\n"
+     ]
+    }
+   ],
+   "source": [
+    "# !pip install transformers accelerate datasets\n",
+    "# !pip install galore-torch\n",
+    "# !pip install huggingface_hub\n",
+    "import torch\n",
+    "\n",
+    "# Report the CUDA setup. The current-device queries raise when no CUDA device is usable,\n",
+    "# so they only run when CUDA is available.\n",
+    "cuda_available = torch.cuda.is_available()\n",
+    "print(\"CUDA 可用:\", cuda_available)\n",
+    "print(\"GPU 数量:\", torch.cuda.device_count())\n",
+    "if cuda_available:\n",
+    "    current_gpu = torch.cuda.current_device()\n",
+    "    print(\"当前 GPU:\", current_gpu)\n",
+    "    print(\"GPU 名称:\", torch.cuda.get_device_name(current_gpu))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ef5551ca-89e2-4488-8e68-1c8d964de039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_seq_length = 1100 + GRAPH_LENGTH  # maximum sequence length: 1100 plus GRAPH_LENGTH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8e283f49-fde4-46e2-9891-dbc304058f0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train_data 重新加载成功，数据量: 12384\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n",
+      "/usr/local/lib/python3.10/dist-packages/galore_torch/adamw.py:48: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m675775971\u001b[0m (\u001b[33myifang_zhao\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.7"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/wandb/run-20250304_134403-e0v0giuw</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/yifang_zhao/huggingface/runs/e0v0giuw' target=\"_blank\">experi030403</a></strong> to <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/yifang_zhao/huggingface' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/yifang_zhao/huggingface/runs/e0v0giuw' target=\"_blank\">https://wandb.ai/yifang_zhao/huggingface/runs/e0v0giuw</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5310' max='5310' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5310/5310 1:33:59, Epoch 3/3]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>5.319300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>3.641300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>1.521800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>1.027500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>0.922400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>0.866900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>0.800500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>0.721600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>0.740400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>0.737000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>550</td>\n",
+       "      <td>0.713500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>600</td>\n",
+       "      <td>0.747000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>650</td>\n",
+       "      <td>0.869500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>700</td>\n",
+       "      <td>1.473300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>750</td>\n",
+       "      <td>0.753000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>800</td>\n",
+       "      <td>0.741300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>850</td>\n",
+       "      <td>0.751400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>900</td>\n",
+       "      <td>0.787600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>950</td>\n",
+       "      <td>0.783200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1000</td>\n",
+       "      <td>0.780200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1050</td>\n",
+       "      <td>1.012900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1100</td>\n",
+       "      <td>1.411700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1150</td>\n",
+       "      <td>1.536400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1200</td>\n",
+       "      <td>0.853800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1250</td>\n",
+       "      <td>0.756500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1300</td>\n",
+       "      <td>0.750800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1350</td>\n",
+       "      <td>0.747400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1400</td>\n",
+       "      <td>0.844400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1450</td>\n",
+       "      <td>0.858400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1500</td>\n",
+       "      <td>1.053400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1550</td>\n",
+       "      <td>1.591600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1600</td>\n",
+       "      <td>1.498900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1650</td>\n",
+       "      <td>1.471700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1700</td>\n",
+       "      <td>1.221100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1750</td>\n",
+       "      <td>1.802300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1800</td>\n",
+       "      <td>1.826000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1850</td>\n",
+       "      <td>1.857300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1900</td>\n",
+       "      <td>1.561800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1950</td>\n",
+       "      <td>1.398800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2000</td>\n",
+       "      <td>1.398900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2050</td>\n",
+       "      <td>1.381600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2100</td>\n",
+       "      <td>0.890300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2150</td>\n",
+       "      <td>0.763700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2200</td>\n",
+       "      <td>0.753100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2250</td>\n",
+       "      <td>0.745500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2300</td>\n",
+       "      <td>1.186100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2350</td>\n",
+       "      <td>0.862000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2400</td>\n",
+       "      <td>1.024600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2450</td>\n",
+       "      <td>1.028400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2500</td>\n",
+       "      <td>1.008500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2550</td>\n",
+       "      <td>0.942800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2600</td>\n",
+       "      <td>0.849700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2650</td>\n",
+       "      <td>0.771400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2700</td>\n",
+       "      <td>0.794100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2750</td>\n",
+       "      <td>0.819200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2800</td>\n",
+       "      <td>0.937500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2850</td>\n",
+       "      <td>1.064500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2900</td>\n",
+       "      <td>1.189300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2950</td>\n",
+       "      <td>1.071100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3000</td>\n",
+       "      <td>1.003300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3050</td>\n",
+       "      <td>1.073900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3100</td>\n",
+       "      <td>1.043100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3150</td>\n",
+       "      <td>1.282600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3200</td>\n",
+       "      <td>2.145400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3250</td>\n",
+       "      <td>1.925800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3300</td>\n",
+       "      <td>2.005600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3350</td>\n",
+       "      <td>2.122600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3400</td>\n",
+       "      <td>2.163000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3450</td>\n",
+       "      <td>2.046600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3500</td>\n",
+       "      <td>2.152200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3550</td>\n",
+       "      <td>2.151700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3600</td>\n",
+       "      <td>5.394900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3650</td>\n",
+       "      <td>4.677800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3700</td>\n",
+       "      <td>4.122200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3750</td>\n",
+       "      <td>3.710200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3800</td>\n",
+       "      <td>3.350800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3850</td>\n",
+       "      <td>3.126300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3900</td>\n",
+       "      <td>2.988700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3950</td>\n",
+       "      <td>2.872000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4000</td>\n",
+       "      <td>2.848200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4050</td>\n",
+       "      <td>2.823900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4100</td>\n",
+       "      <td>2.781200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4150</td>\n",
+       "      <td>2.735000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4200</td>\n",
+       "      <td>2.725900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4250</td>\n",
+       "      <td>2.644400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4300</td>\n",
+       "      <td>2.700000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4350</td>\n",
+       "      <td>2.650100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4400</td>\n",
+       "      <td>2.704500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4450</td>\n",
+       "      <td>2.596700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4500</td>\n",
+       "      <td>2.510500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4550</td>\n",
+       "      <td>2.515800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4600</td>\n",
+       "      <td>2.498100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4650</td>\n",
+       "      <td>2.458900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4700</td>\n",
+       "      <td>2.449700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4750</td>\n",
+       "      <td>2.425000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4800</td>\n",
+       "      <td>2.362300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4850</td>\n",
+       "      <td>2.232000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4900</td>\n",
+       "      <td>2.361500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4950</td>\n",
+       "      <td>2.302300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5000</td>\n",
+       "      <td>2.333900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5050</td>\n",
+       "      <td>2.367200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5100</td>\n",
+       "      <td>2.288300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5150</td>\n",
+       "      <td>2.426100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5200</td>\n",
+       "      <td>2.344100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5250</td>\n",
+       "      <td>2.283500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5300</td>\n",
+       "      <td>2.296500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new3/commit/b9472b66316be8654c6f7c173fa4561889bd3446', commit_message='End of training', commit_description='', oid='b9472b66316be8654c6f7c173fa4561889bd3446', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KSU-HW-SEC/r1q1.5_graph_lora_new3', endpoint='https://huggingface.co', repo_type='model', repo_id='KSU-HW-SEC/r1q1.5_graph_lora_new3'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "# Load the training samples from train_data.pt.\n",
+    "# The previous version loaded the file unconditionally and then re-checked\n",
+    "# `'train_data' not in globals()`, which could never be true, so its missing-file\n",
+    "# branch was unreachable. A single existence check now covers that case.\n",
+    "# NOTE(security): weights_only=False unpickles arbitrary objects; only load trusted files.\n",
+    "train_data_path = \"train_data.pt\"\n",
+    "if not os.path.exists(train_data_path):\n",
+    "    # Raise a real exception instead of printing and calling exit() inside a notebook.\n",
+    "    raise FileNotFoundError(f\"未找到 {train_data_path}，请检查路径！\")\n",
+    "train_data = torch.load(train_data_path, weights_only=False)\n",
+    "print(\"train_data 重新加载成功，数据量:\", len(train_data))\n",
+    "\n",
+    "# Use the default checkpoint name when MODEL_NAME has not been defined yet.\n",
+    "if \"MODEL_NAME\" not in globals():\n",
+    "    MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"  # default model\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from transformers import Trainer, TrainingArguments, AutoModelForCausalLM\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "class GraphDataset(Dataset):\n",
+    "    def __init__(self, data):\n",
+    "        self.data = data\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        sample = self.data[idx]\n",
+    "        return {\n",
+    "            \"input_ids\": sample[\"input_ids\"],\n",
+    "            \"attention_mask\": sample[\"attention_mask\"],\n",
+    "            \"graph_embedding\": sample[\"graph_embedding\"],  # 额外输入\n",
+    "            \"labels\": sample[\"labels\"],\n",
+    "        }\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal LM wrapper that prepends projected graph embeddings to the token embeddings.\n",
+    "\n",
+    "    Each graph embedding (512 features) is mapped to the LM hidden size, passed through\n",
+    "    self-attention, a linear layer with a residual connection and LayerNorm, and then\n",
+    "    concatenated in front of the token embeddings fed to the wrapped language model.\n",
+    "\n",
+    "    The class used to subclass `AutoModelForCausalLM`. Auto classes are factories whose\n",
+    "    `from_pretrained` returns the concrete architecture class, so the custom `__init__`\n",
+    "    and `forward` were never executed and `graph_embedding` was never used. It is now a\n",
+    "    plain `nn.Module` with its own `from_pretrained`, so existing call sites still work.\n",
+    "\n",
+    "    NOTE(review): this is not a `PreTrainedModel`, so `Trainer` presumably saves only its\n",
+    "    state dict - confirm that downstream loading code rebuilds the wrapper first.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, num_heads=8):\n",
+    "        \"\"\"Load the base LM and create the graph projection layers.\n",
+    "\n",
+    "        Args:\n",
+    "            pretrained_model_name_or_path: checkpoint passed to `AutoModelForCausalLM.from_pretrained`.\n",
+    "            num_heads: number of heads of the graph attention layer.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "\n",
+    "        # Load the pretrained language model once and expose its config on the wrapper.\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)\n",
+    "        self.config = self.model.config\n",
+    "        hidden_size = self.config.hidden_size\n",
+    "\n",
+    "        # 1. Project the graph embedding from 512 features to the LM hidden size.\n",
+    "        self.linear1 = nn.Linear(512, hidden_size)\n",
+    "\n",
+    "        # 2. Multi-head self-attention over the graph tokens.\n",
+    "        self.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)\n",
+    "\n",
+    "        # 3. Linear layer applied to the attention output.\n",
+    "        self.linear2 = nn.Linear(hidden_size, hidden_size)\n",
+    "\n",
+    "        # 4. LayerNorm applied after the residual connection.\n",
+    "        self.norm = nn.LayerNorm(hidden_size)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, num_heads=8):\n",
+    "        \"\"\"Build the wrapper around a pretrained checkpoint; keeps `GraphAwareLM.from_pretrained(name)` working.\"\"\"\n",
+    "        return cls(pretrained_model_name_or_path, num_heads=num_heads)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the LM on the token embeddings with the graph tokens prepended.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, shape (batch_size, seq_len).\n",
+    "            attention_mask: optional mask, shape (batch_size, seq_len).\n",
+    "            labels: optional target ids, shape (batch_size, seq_len).\n",
+    "            graph_embedding: shape (batch_size, 512) or (batch_size, graph_len, 512);\n",
+    "                when None the wrapped LM is called on the text inputs only.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output object of the wrapped language model.\n",
+    "        \"\"\"\n",
+    "        if graph_embedding is None:\n",
+    "            return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "\n",
+    "        # Token embeddings: (batch_size, seq_len, hidden_size).\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n",
+    "\n",
+    "        # Add the sequence axis BEFORE attention. A 2-D input is treated by\n",
+    "        # nn.MultiheadAttention as one unbatched sequence, which made the samples\n",
+    "        # of a batch attend to each other.\n",
+    "        if graph_embedding.dim() == 2:\n",
+    "            graph_embedding = graph_embedding.unsqueeze(1)  # (batch_size, 1, 512)\n",
+    "        graph_embedding = graph_embedding.to(self.linear1.weight.dtype)\n",
+    "\n",
+    "        # Projection -> self-attention -> linear + residual -> LayerNorm.\n",
+    "        graph_tokens = self.linear1(graph_embedding)  # (batch_size, graph_len, hidden_size)\n",
+    "        attn_output, _ = self.multihead_attn(graph_tokens, graph_tokens, graph_tokens)\n",
+    "        graph_tokens = self.norm(self.linear2(attn_output) + graph_tokens)\n",
+    "        graph_tokens = graph_tokens.to(inputs_embeds.dtype)\n",
+    "        graph_len = graph_tokens.shape[1]\n",
+    "\n",
+    "        # Prepend the graph tokens: (batch_size, graph_len + seq_len, hidden_size).\n",
+    "        inputs_embeds = torch.cat([graph_tokens, inputs_embeds], dim=1)\n",
+    "\n",
+    "        # Extend the attention mask so the graph tokens are attended to.\n",
+    "        if attention_mask is not None:\n",
+    "            graph_mask = torch.ones((attention_mask.shape[0], graph_len), device=attention_mask.device, dtype=attention_mask.dtype)\n",
+    "            attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "        # Extend the labels with the ignore index (-100) so they keep the same length\n",
+    "        # as the inputs and the graph positions do not contribute to the loss.\n",
+    "        if labels is not None:\n",
+    "            graph_labels = torch.full((labels.shape[0], graph_len), -100, device=labels.device, dtype=labels.dtype)\n",
+    "            labels = torch.cat([graph_labels, labels], dim=1)\n",
+    "\n",
+    "        # Feed the combined embeddings to the wrapped language model.\n",
+    "        outputs = self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "        return outputs\n",
+    "\n",
+    "from transformers import Trainer\n",
+    "\n",
+    "class GraphTrainer(Trainer):\n",
+    "    \"\"\"Trainer whose loss computation also forwards `graph_embedding` to the model.\"\"\"\n",
+    "\n",
+    "    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n",
+    "        \"\"\"Call the model on one batch and return its loss.\n",
+    "\n",
+    "        `input_ids`, `attention_mask` and `labels` are required keys of `inputs`;\n",
+    "        `graph_embedding` is passed along only when it is present and not None.\n",
+    "        Returns `(loss, outputs)` when `return_outputs` is true, otherwise the loss.\n",
+    "        \"\"\"\n",
+    "        forward_args = {name: inputs[name] for name in (\"input_ids\", \"attention_mask\", \"labels\")}\n",
+    "\n",
+    "        graph_vec = inputs.get(\"graph_embedding\", None)\n",
+    "        if graph_vec is not None:\n",
+    "            forward_args[\"graph_embedding\"] = graph_vec\n",
+    "\n",
+    "        result = model(**forward_args)\n",
+    "        batch_loss = result.loss\n",
+    "\n",
+    "        if return_outputs:\n",
+    "            return (batch_loss, result)\n",
+    "        return batch_loss\n",
+    "\n",
+    "\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# ✅ 载入微调模型\n",
+    "model = GraphAwareLM.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# ✅ 训练参数\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results3\",\n",
+    "    per_device_train_batch_size=7,\n",
+    "    eval_strategy=\"no\",\n",
+    "    save_strategy=\"steps\",\n",
+    "    save_steps=3000,\n",
+    "    logging_steps=50,\n",
+    "    bf16=True,\n",
+    "    optim=\"galore_adamw\",\n",
+    "    optim_target_modules=\"all-linear\",  # ✅ 让 GaLore 作用于所有线性层\n",
+    "    optim_args=\"rank=128,scale=2.0\",  # ✅ 低秩分解参数\n",
+    "    warmup_steps=1000,\n",
+    "    num_train_epochs=3,\n",
+    "    push_to_hub=True,\n",
+    "    hub_model_id=HF_NAME,\n",
+    "    hub_strategy=\"every_save\",\n",
+    "    run_name = \"experi030403\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# ✅ 转换 `train_data` 为 `Dataset`\n",
+    "train_dataset = GraphDataset(train_data)\n",
+    "\n",
+    "# ✅ 训练\n",
+    "trainer = GraphTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.save_model(\"/workspace/model3\")\n",
+    "trainer.push_to_hub()\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7a72ac3b-561e-41d3-ae93-99f20acf3188",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora_new2-3000', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora_new2-3000')"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "\n",
+    "# Create the Hub repository if it does not exist yet; the call result is the cell output.\n",
+    "repo_name = \"r1q1.5_graph_lora-results3\"  # name of the model repository\n",
+    "api = HfApi()\n",
+    "api.create_repo(repo_id=repo_name, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "73c434b9-5d58-4819-8526-24aa18ca1010",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8b896f21685e4086b0b59404b2b1a866",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d20bff067ca44c4583378181da817897",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c4b7114a53b341539a3244f2eea8aacf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "74c6045017b640bdba86fe3ed1bb9c92",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "97436b084bc4420f8b273ec462c50e61",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "optimizer.pt:   0%|          | 0.00/4.32G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d7f10ccff3674e6fa8bcb42553c12b19",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c5b1a010fd0845f9ba9112291afa8f17",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/YiFzhao/r1q1.5_graph_lora_new2-3000/commit/4088de651a0ce2cc39fcb0c950898e54ce91bdea', commit_message='upload checkpoint-3000', commit_description='', oid='4088de651a0ce2cc39fcb0c950898e54ce91bdea', pr_url=None, repo_url=RepoUrl('https://huggingface.co/YiFzhao/r1q1.5_graph_lora_new2-3000', endpoint='https://huggingface.co', repo_type='model', repo_id='YiFzhao/r1q1.5_graph_lora_new2-3000'), pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import upload_folder\n",
+    "\n",
+    "# Upload the contents of /workspace/results3 to the matching Hub repository.\n",
+    "# The commit message used to say 'upload results2', a leftover from the previous run.\n",
+    "upload_folder(\n",
+    "    folder_path=\"/workspace/results3\",\n",
+    "    repo_id=\"YiFzhao/r1q1.5_graph_lora-results3\",\n",
+    "    commit_message=\"upload results3\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8d2ebf87-402e-444d-8599-96c313f1b7fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🚀 处理后数据条数: 12384\n",
+      "✅ 示例数据: {'input_ids': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]), 'labels': tensor([151643, 151643, 151643,  ...,   1493,   7525,    624]), 'graph_embedding': tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+      "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+      "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+      "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+      "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+      "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+      "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+      "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+      "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+      "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+      "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+      "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+      "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+      "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+      "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+      "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+      "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+      "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+      "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+      "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+      "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+      "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+      "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+      "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+      "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+      "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+      "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+      "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+      "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+      "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+      "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+      "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+      "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+      "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+      "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+      "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+      "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+      "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+      "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+      "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+      "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+      "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+      "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+      "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+      "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+      "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+      "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+      "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+      "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+      "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+      "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+      "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+      "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+      "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+      "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+      "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+      "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+      "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+      "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+      "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+      "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+      "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+      "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+      "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635])}\n",
+      "✅ train_data 已保存到 train_data.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Build train_data: one record per sample of final_Graph.json, pairing the\n",
+    "# tokenized dialogue with that sample's graph embedding.\n",
+    "# MODEL_NAME, ROLE_TOKENS, GRAPH_LENGTH and max_seq_length are not defined in\n",
+    "# this cell and must already exist in the kernel.\n",
+    "IGNORE_INDEX = -100  # default ignore_index of torch.nn.CrossEntropyLoss\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # padding reuses the EOS token\n",
+    "\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "train_data = []\n",
+    "\n",
+    "\n",
+    "for sample in data:\n",
+    "    conversations = sample.get(\"conversations\", [])\n",
+    "    embeddings = sample.get(\"embedding\", [])\n",
+    "\n",
+    "    # Skip samples whose embedding is missing or is not a non-empty list.\n",
+    "    if not isinstance(embeddings, list) or len(embeddings) == 0:\n",
+    "        print(f\"无效的 embedding，跳过样本：{sample}\")\n",
+    "        continue\n",
+    "\n",
+    "    # squeeze(0) drops a leading singleton dimension if the embedding has one.\n",
+    "    graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0)\n",
+    "\n",
+    "    # Concatenate every turn into one training string: \"<role token> <content>\\n\".\n",
+    "    dialogue_text = \"\"\n",
+    "    for conv in conversations:\n",
+    "        role = conv[\"from\"]  # \"human\" or \"gpt\"\n",
+    "        content = conv[\"value\"]\n",
+    "        content = content.replace(\"<image>\", \"\")  # strip the <image> placeholder\n",
+    "        role_token = ROLE_TOKENS.get(role, f\"<|{role}|>\")  # fallback for unknown roles\n",
+    "        dialogue_text += f\"{role_token} {content}\\n\"\n",
+    "\n",
+    "    tokenized = tokenizer(\n",
+    "        dialogue_text,\n",
+    "        padding=\"max_length\",\n",
+    "        truncation=True,\n",
+    "        max_length=max_seq_length - GRAPH_LENGTH,  # leave room for the graph embedding\n",
+    "        return_tensors=\"pt\",\n",
+    "    )\n",
+    "\n",
+    "    input_ids = tokenized[\"input_ids\"].squeeze(0)\n",
+    "    attention_mask = tokenized[\"attention_mask\"].squeeze(0)\n",
+    "\n",
+    "    # Exclude padding from the loss. The pad token is the EOS token, so padding is\n",
+    "    # identified through the attention mask rather than through the token id.\n",
+    "    labels = input_ids.clone()\n",
+    "    labels[attention_mask == 0] = IGNORE_INDEX\n",
+    "\n",
+    "    train_data.append({\n",
+    "        \"input_ids\": input_ids,\n",
+    "        \"attention_mask\": attention_mask,\n",
+    "        \"labels\": labels,\n",
+    "        \"graph_embedding\": graph_embedding,\n",
+    "    })\n",
+    "\n",
+    "print(\"🚀 处理后数据条数:\", len(train_data))\n",
+    "if train_data:\n",
+    "    # Show tensor shapes only; printing a whole record floods the cell output.\n",
+    "    print(\"✅ 示例数据:\", {key: tuple(value.shape) for key, value in train_data[0].items()})\n",
+    "torch.save(train_data, \"train_data.pt\")\n",
+    "print(\"✅ train_data 已保存到 train_data.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "05a48aa8-c597-4ff1-9569-aa210f4f1f5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoConfig\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class GraphAwareLM(nn.Module):\n",
+    "    \"\"\"Causal language model conditioned on a graph embedding.\n",
+    "\n",
+    "    A small adapter (Linear -> MultiheadAttention -> Linear + residual -> LayerNorm)\n",
+    "    projects the graph embedding to the LM hidden size. The result is prepended to\n",
+    "    the token embeddings as one extra position before the wrapped language model runs.\n",
+    "\n",
+    "    The class wraps the language model instead of subclassing `AutoModelForCausalLM`:\n",
+    "    that factory cannot be instantiated directly, and its `from_pretrained` returns the\n",
+    "    concrete architecture class, so `forward` / `generate` overrides defined on a\n",
+    "    subclass are never used.\n",
+    "\n",
+    "    NOTE(review): the adapter layers are created with fresh weights; nothing in this\n",
+    "    class restores trained adapter weights from a checkpoint.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pretrained_model_name_or_path, num_heads=8, *model_args,\n",
+    "                 graph_dim=512, tokenizer=None, **kwargs):\n",
+    "        \"\"\"Load the language model and build the graph adapter.\n",
+    "\n",
+    "        Args:\n",
+    "            pretrained_model_name_or_path: forwarded to `AutoModelForCausalLM.from_pretrained`.\n",
+    "            num_heads: number of heads of the adapter attention layer.\n",
+    "            *model_args, **kwargs: forwarded to `AutoModelForCausalLM.from_pretrained`.\n",
+    "            graph_dim: size of the incoming graph embedding.\n",
+    "            tokenizer: optional tokenizer used by `generate` to decode its output.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "\n",
+    "        # Load the pretrained LM exactly once and reuse its config.\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)\n",
+    "        self.config = self.model.config\n",
+    "        self.tokenizer = tokenizer\n",
+    "        hidden_size = self.config.hidden_size\n",
+    "\n",
+    "        # 1. Project the graph embedding from graph_dim to hidden_size.\n",
+    "        self.linear1 = nn.Linear(graph_dim, hidden_size)\n",
+    "        # 2. Multi-head self-attention over the projected graph token.\n",
+    "        self.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)\n",
+    "        # 3. Linear layer applied to the attention output (residual added in _encode_graph).\n",
+    "        self.linear2 = nn.Linear(hidden_size, hidden_size)\n",
+    "        # 4. LayerNorm applied after the residual connection.\n",
+    "        self.norm = nn.LayerNorm(hidden_size)\n",
+    "\n",
+    "    def _encode_graph(self, graph_embedding, batch_size, dtype):\n",
+    "        \"\"\"Turn graph embeddings into prefix tokens of shape (batch_size, 1, hidden_size).\"\"\"\n",
+    "        if graph_embedding.dim() == 1:  # (graph_dim,) -> (1, graph_dim)\n",
+    "            graph_embedding = graph_embedding.unsqueeze(0)\n",
+    "        if graph_embedding.dim() == 2:  # (batch, graph_dim) -> (batch, 1, graph_dim)\n",
+    "            graph_embedding = graph_embedding.unsqueeze(1)\n",
+    "        weight = self.linear1.weight\n",
+    "        graph_embedding = graph_embedding.to(device=weight.device, dtype=weight.dtype)\n",
+    "\n",
+    "        graph_token = self.linear1(graph_embedding)\n",
+    "        # The input must be 3-D here: with batch_first=True a 2-D tensor is treated as one\n",
+    "        # unbatched sequence, which would let the samples of a batch attend to each other.\n",
+    "        attn_output, _ = self.multihead_attn(graph_token, graph_token, graph_token)\n",
+    "        graph_token = self.norm(self.linear2(attn_output) + graph_token)\n",
+    "\n",
+    "        # A single graph embedding is shared by every sequence of the batch.\n",
+    "        if graph_token.shape[0] == 1 and batch_size > 1:\n",
+    "            graph_token = graph_token.expand(batch_size, -1, -1)\n",
+    "        return graph_token.to(dtype)\n",
+    "\n",
+    "    def forward(self, input_ids=None, attention_mask=None, labels=None, graph_embedding=None):\n",
+    "        \"\"\"Run the language model with the graph token prepended to the input.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: (batch_size, seq_len) token ids.\n",
+    "            attention_mask: optional (batch_size, seq_len) mask.\n",
+    "            labels: optional (batch_size, seq_len) targets.\n",
+    "            graph_embedding: (graph_dim,), (batch_size, graph_dim) or\n",
+    "                (batch_size, 1, graph_dim); when None no prefix is added.\n",
+    "\n",
+    "        Returns:\n",
+    "            The output of the wrapped model. With a graph embedding the logits cover\n",
+    "            seq_len + 1 positions; position 0 belongs to the graph token.\n",
+    "        \"\"\"\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(input_ids)  # (batch_size, seq_len, hidden_size)\n",
+    "\n",
+    "        if graph_embedding is not None:\n",
+    "            graph_token = self._encode_graph(graph_embedding, inputs_embeds.shape[0], inputs_embeds.dtype)\n",
+    "            inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)  # (batch_size, seq_len+1, hidden_size)\n",
+    "\n",
+    "            # The graph position is always attended to.\n",
+    "            if attention_mask is not None:\n",
+    "                graph_mask = attention_mask.new_ones((attention_mask.shape[0], 1))\n",
+    "                attention_mask = torch.cat([graph_mask, attention_mask], dim=1)\n",
+    "\n",
+    "            # Keep labels aligned with the longer input; the graph position has no target.\n",
+    "            if labels is not None:\n",
+    "                graph_label = labels.new_full((labels.shape[0], 1), -100)\n",
+    "                labels = torch.cat([graph_label, labels], dim=1)\n",
+    "\n",
+    "        return self.model(\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            attention_mask=attention_mask,\n",
+    "            labels=labels,\n",
+    "        )\n",
+    "\n",
+    "    def generate(self, inputs, graph_embedding, max_length=500, temperature=0.7, top_k=50, top_p=0.9, tokenizer=None):\n",
+    "        \"\"\"Sample a continuation of `inputs` conditioned on `graph_embedding`.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: tokenizer output holding \"input_ids\" and optionally \"attention_mask\".\n",
+    "            graph_embedding: same shapes as accepted by `forward`.\n",
+    "            max_length, temperature, top_k, top_p: forwarded to the wrapped model's `generate`.\n",
+    "            tokenizer: tokenizer used for decoding; falls back to `self.tokenizer`.\n",
+    "\n",
+    "        Returns:\n",
+    "            The decoded text of the first sequence when a tokenizer is available,\n",
+    "            otherwise the tensor of generated token ids.\n",
+    "        \"\"\"\n",
+    "        # Token embeddings with the graph token in front: (batch, seq_len+1, hidden_size).\n",
+    "        inputs_embeds = self.model.get_input_embeddings()(inputs[\"input_ids\"])\n",
+    "        graph_token = self._encode_graph(graph_embedding, inputs_embeds.shape[0], inputs_embeds.dtype)\n",
+    "        inputs_embeds = torch.cat([graph_token, inputs_embeds], dim=1)\n",
+    "\n",
+    "        # Extend the attention mask by the graph position.\n",
+    "        if \"attention_mask\" in inputs:\n",
+    "            text_mask = inputs[\"attention_mask\"]\n",
+    "            attention_mask = torch.cat([text_mask.new_ones((text_mask.shape[0], 1)), text_mask], dim=1)\n",
+    "        else:\n",
+    "            attention_mask = None\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            output_ids = self.model.generate(\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                attention_mask=attention_mask,\n",
+    "                max_length=max_length,\n",
+    "                do_sample=True,  # temperature / top_k / top_p only take effect when sampling\n",
+    "                temperature=temperature,\n",
+    "                top_k=top_k,\n",
+    "                top_p=top_p,\n",
+    "                num_return_sequences=1,\n",
+    "            )\n",
+    "\n",
+    "        tokenizer = tokenizer if tokenizer is not None else self.tokenizer\n",
+    "        if tokenizer is None:\n",
+    "            return output_ids\n",
+    "        return tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):\n",
+    "        \"\"\"Build a GraphAwareLM around the language model stored at the given path.\n",
+    "\n",
+    "        `num_heads`, `graph_dim` and `tokenizer` may be passed as keyword arguments;\n",
+    "        everything else is forwarded to `AutoModelForCausalLM.from_pretrained`.\n",
+    "        \"\"\"\n",
+    "        num_heads = kwargs.pop(\"num_heads\", 8)\n",
+    "        return cls(pretrained_model_name_or_path, num_heads, *model_args, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "73ae15d9-c9d9-4e64-ac8b-2d5877eac984",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prefer the GPU when CUDA is available, otherwise run on the CPU.\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "21c8df04-0dc2-436c-aaaf-74a885f734d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0b50f0cd6d784f598cc64a40cff40f38",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Qwen2ForCausalLM(\n",
+       "  (model): Qwen2Model(\n",
+       "    (embed_tokens): Embedding(151936, 1536)\n",
+       "    (layers): ModuleList(\n",
+       "      (0-27): 28 x Qwen2DecoderLayer(\n",
+       "        (self_attn): Qwen2Attention(\n",
+       "          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "          (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
+       "          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
+       "        )\n",
+       "        (mlp): Qwen2MLP(\n",
+       "          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
+       "          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
+       "          (act_fn): SiLU()\n",
+       "        )\n",
+       "        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "      )\n",
+       "    )\n",
+       "    (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
+       "    (rotary_emb): Qwen2RotaryEmbedding()\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
+       "  (linear1): Linear(in_features=512, out_features=1536, bias=True)\n",
+       "  (multihead_attn): MultiheadAttention(\n",
+       "    (out_proj): NonDynamicallyQuantizableLinear(in_features=1536, out_features=1536, bias=True)\n",
+       "  )\n",
+       "  (linear2): Linear(in_features=1536, out_features=1536, bias=True)\n",
+       "  (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# The tokenizer is loaded from the base model id.\n",
+    "MODEL_NAME = \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "\n",
+    "# Load the trained checkpoint, move it to `device` and switch to inference mode.\n",
+    "model_path = \"/workspace/model2\"  # assigned here but not used by this cell\n",
+    "model = GraphAwareLM.from_pretrained(\"/workspace/results3/checkpoint-3000\")\n",
+    "model = model.to(device)\n",
+    "model.eval()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "51995891-8906-4049-9401-2d22e06a84e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[-0.0380, -0.0350, -0.0423,  ...,  0.0213,  0.0148, -0.0047],\n",
+      "        [ 0.0131,  0.0388, -0.0378,  ...,  0.0399, -0.0309, -0.0342],\n",
+      "        [ 0.0084, -0.0116,  0.0259,  ...,  0.0344,  0.0268, -0.0062],\n",
+      "        ...,\n",
+      "        [ 0.0080, -0.0073, -0.0023,  ..., -0.0120,  0.0387,  0.0209],\n",
+      "        [ 0.0277,  0.0326,  0.0270,  ...,  0.0124, -0.0348,  0.0389],\n",
+      "        [ 0.0184, -0.0410, -0.0415,  ...,  0.0255, -0.0429, -0.0386]],\n",
+      "       device='cuda:0', requires_grad=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Inspect the graph projection weights. GraphAwareLM names this layer `linear1`;\n",
+    "# it has no `graph_proj` attribute.\n",
+    "print(model.linear1.weight)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7a8562c0-8d55-4412-8f89-de20bae0f7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "json_path = \"final_Graph.json\"\n",
+    "with open(json_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "# The first sample of the dataset is used as the probe.\n",
+    "test_data = data[0]\n",
+    "\n",
+    "conversations = test_data.get(\"conversations\")\n",
+    "embeddings = test_data.get(\"embedding\") \n",
+    "\n",
+    "# squeeze(0) drops a leading singleton dimension if the embedding has one.\n",
+    "graph_embedding = torch.tensor(embeddings, dtype=torch.float32).squeeze(0).to(device)\n",
+    "\n",
+    "# First turn of the conversation, without the <image> placeholder.\n",
+    "question1 = conversations[0][\"value\"].replace(\"<image>\", \"\").strip()\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# Role markers; the data-preparation cell prefixes every turn with one of these.\n",
+    "ROLE_TOKENS = {\n",
+    "    \"human\": \"<|User|>\",     \n",
+    "    \"gpt\": \"<|Assistant|>\",   \n",
+    "}\n",
+    "GRAPH_LENGTH = 512\n",
+    "max_seq_length = 1100 + GRAPH_LENGTH\n",
+    "\n",
+    "# Build the prompt with the same \"<role token> <content>\\n\" layout used for training and\n",
+    "# end it with the assistant marker, so that the model continues with the answer.\n",
+    "# NOTE(review): training does not strip the content, question1 is stripped - confirm\n",
+    "# that this difference does not matter for the data.\n",
+    "prompt = f\"{ROLE_TOKENS['human']} {question1}\\n{ROLE_TOKENS['gpt']}\"\n",
+    "inputs = tokenizer(prompt, return_tensors=\"pt\",truncation=True,max_length=max_seq_length - GRAPH_LENGTH).to(device)\n",
+    "\n",
+    "input_ids = inputs[\"input_ids\"]\n",
+    "attention_mask = inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "4bd7493f-ca8d-4c28-914d-95b1c30f8fcc",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Tensor' object has no attribute 'update'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m generated_text \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_embedding\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m    113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    114\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1982\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)\u001b[0m\n\u001b[1;32m   1979\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)  \u001b[38;5;66;03m# Pull this out first, we only use it for stopping criteria\u001b[39;00m\n\u001b[1;32m   1980\u001b[0m assistant_tokenizer \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massistant_tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)  \u001b[38;5;66;03m# only used for assisted generation\u001b[39;00m\n\u001b[0;32m-> 1982\u001b[0m generation_config, model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare_generation_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1983\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_model_kwargs(model_kwargs\u001b[38;5;241m.\u001b[39mcopy())\n\u001b[1;32m   1984\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_assistant(assistant_model, tokenizer, assistant_tokenizer)\n",
+      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1549\u001b[0m, in \u001b[0;36mGenerationMixin._prepare_generation_config\u001b[0;34m(self, generation_config, **kwargs)\u001b[0m\n\u001b[1;32m   1547\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torchdynamo_compiling():\n\u001b[1;32m   1548\u001b[0m     generation_config \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(generation_config)\n\u001b[0;32m-> 1549\u001b[0m     model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[43mgeneration_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupdate\u001b[49m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1550\u001b[0m     \u001b[38;5;66;03m# If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model\u001b[39;00m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m using_model_generation_config:\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'Tensor' object has no attribute 'update'"
+     ]
+    }
+   ],
+   "source": [
+    "generated_text = model.generate(inputs, graph_embedding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62f40327-f102-4259-80a5-8761d5d7d3c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([-2.4214, -0.5552,  1.0389, -1.3428, -0.1341,  0.6100, -0.4200, -1.8584,\n",
+       "        -0.2880, -0.4779,  0.3452, -0.8934, -0.9216,  0.5600,  0.2474, -0.9009,\n",
+       "        -1.0995,  0.6065,  1.7662, -1.2281,  0.0000, -1.9196,  0.1920, -1.2770,\n",
+       "        -0.6918, -1.3762, -0.7639, -0.1023,  2.5149,  1.1990, -0.2678, -0.7488,\n",
+       "        -0.0000,  0.9108,  0.2010, -0.2639,  0.5023, -0.8752,  0.2083,  0.5740,\n",
+       "         0.3758, -0.7036, -1.3210, -0.8119, -0.5329, -0.2355, -0.2750,  1.6133,\n",
+       "        -2.3233,  0.3174,  0.0000,  0.5769,  0.3558,  0.2234, -0.0666, -0.6310,\n",
+       "        -0.3533,  0.9497, -0.9576,  0.1615, -0.0460, -1.1686,  1.4337, -1.2952,\n",
+       "        -1.1095,  0.5081, -1.9626, -0.3278,  0.7837, -2.4616,  0.3936, -0.3157,\n",
+       "        -1.6531, -0.0708, -0.6630,  0.4285,  0.1360, -0.7986, -0.1449,  0.0000,\n",
+       "         0.9076,  0.7794,  0.6391,  0.9840,  0.2970,  1.5463,  1.1554, -0.5432,\n",
+       "         0.7202,  0.0000, -0.2380,  0.0422,  0.0000,  0.4296,  0.2068,  0.3330,\n",
+       "        -0.5888,  0.0000,  1.0656, -0.2724,  0.7562, -0.6863, -1.6948, -0.1634,\n",
+       "         1.8262,  1.4235,  0.9178, -0.7475, -0.2682,  0.5534,  1.5643, -0.9898,\n",
+       "        -0.2911,  1.3752,  0.6331, -0.1162,  1.7250,  0.8486, -0.0000, -1.6454,\n",
+       "        -4.2099, -0.1101,  0.9528, -0.1335,  0.1057,  0.2624,  2.4600,  1.2772,\n",
+       "        -3.6113, -1.6540,  1.7807, -0.5077,  0.4537,  1.0987, -0.0713,  0.1391,\n",
+       "        -0.0000, -1.3129,  0.5611, -0.3687, -0.7690,  0.0190,  0.9332, -0.4274,\n",
+       "        -0.4125, -0.6608,  0.4810, -0.6759, -0.8501,  0.0000, -1.6998,  0.3269,\n",
+       "         0.0334, -0.8513, -0.8695, -0.2957, -2.1983,  1.1621,  0.1864,  0.6089,\n",
+       "         0.4840, -0.6849,  0.2127,  0.7035, -2.9177,  2.2954, -2.0283, -2.1883,\n",
+       "        -0.0000,  0.1591,  1.3046, -0.0000,  0.2811,  0.0935, -1.0028,  0.8179,\n",
+       "         1.5387,  0.5271,  0.2195, -0.0882, -1.3943,  0.8263,  0.7164,  0.6240,\n",
+       "         0.7027, -0.5830, -1.2238, -0.0000,  0.5721,  0.0000,  0.3103,  0.7294,\n",
+       "        -0.0224,  2.8884, -0.0000, -0.0000,  2.1562, -0.6177,  1.5242, -0.0000,\n",
+       "        -0.9023, -0.0000,  1.9196, -0.9594, -0.7334,  0.6636,  0.0000,  0.5613,\n",
+       "        -0.3294,  1.1782, -0.8789,  1.6285,  0.3845,  0.1210,  1.3321,  0.5566,\n",
+       "        -0.4729,  1.9552, -0.6409,  1.1379, -0.0000,  1.2146, -0.7578, -0.3764,\n",
+       "        -0.0823, -1.7541, -0.1362, -0.1631, -0.6794,  1.2874,  0.2402,  0.0000,\n",
+       "         2.3540, -0.5574, -0.9901,  0.3435,  0.6318, -0.3071, -0.6270, -1.8417,\n",
+       "        -1.9213, -0.4928,  0.1969, -1.2195, -0.1594, -1.1694,  1.9461,  1.4360,\n",
+       "        -0.4050,  1.3495,  0.3053, -0.3500, -0.1546, -0.4096,  0.8011, -0.5379,\n",
+       "        -0.1322,  0.0000,  1.7025, -0.0000, -0.7611,  1.4174, -1.0466, -0.8641,\n",
+       "         0.3074, -0.9910,  0.0000,  1.2856, -0.3916, -1.4133, -1.2143, -1.1373,\n",
+       "        -0.4996, -0.3315,  1.6280,  0.1051,  0.3570,  2.4021, -0.0249,  0.8169,\n",
+       "        -0.4497, -1.4486, -0.0000, -0.7351, -0.3337,  0.2480, -0.5413,  2.2289,\n",
+       "         1.6903,  0.7866,  0.6164,  0.8920, -1.1745, -0.3534, -0.4512,  0.0000,\n",
+       "        -0.3795, -1.2503, -0.5114,  1.6374,  1.3271,  1.8410,  0.1040,  0.9731,\n",
+       "        -0.3357,  2.4072, -0.0000,  1.9666, -0.5907,  1.0771,  1.6236, -0.9991,\n",
+       "        -0.0282,  0.6689, -1.0429,  0.9279,  0.0000, -0.1722, -1.0940, -1.1756,\n",
+       "        -0.2457, -1.1142, -1.5693,  1.7408,  1.8951, -1.5109, -0.3783, -0.4719,\n",
+       "        -0.7410, -0.2575,  0.0000, -0.8207, -0.6377, -1.2434,  0.4213, -2.1689,\n",
+       "         1.1191,  0.8991, -0.7343, -0.0000,  0.1287, -1.0638, -1.3629, -0.0916,\n",
+       "         0.6016, -1.2285,  2.1858, -0.1274, -0.1246,  0.8666, -0.1599, -0.9024,\n",
+       "        -0.6486,  0.9323,  1.4422, -0.7030,  1.6400,  1.2095,  0.9178, -0.6975,\n",
+       "         1.5239, -1.8692, -2.4644, -0.0000,  1.3411, -0.0351,  1.9389,  1.3991,\n",
+       "        -1.0556, -0.8072,  0.9237,  0.8799,  0.2778, -0.8607,  0.4810, -0.0000,\n",
+       "         0.8293,  0.0735,  2.2176, -0.0000, -0.4048,  0.8768, -1.4589, -2.3772,\n",
+       "        -0.5785,  0.7544, -1.3414,  0.7273, -1.4420,  2.0120, -0.0846, -1.0264,\n",
+       "        -0.8520, -0.3899, -0.0000, -0.5772, -0.1395, -0.8346,  2.7815,  0.3414,\n",
+       "         2.6266,  0.2384,  2.0168,  0.6710,  0.9409, -0.3611,  1.6438, -0.0000,\n",
+       "        -0.8750, -0.1610,  0.8060, -1.5453,  0.3108, -0.6887,  0.0000,  0.3937,\n",
+       "         0.2050, -0.7704,  1.1102,  0.1719, -0.4513, -0.1844,  0.7308, -2.4639,\n",
+       "        -0.1578, -0.5711, -0.4696, -0.8899,  0.0929, -0.2267,  0.1619,  0.7937,\n",
+       "        -0.3767,  0.2024,  0.3893, -0.7677,  1.5729, -0.6239, -0.0000,  0.8411,\n",
+       "         0.6361, -1.1110, -1.2833,  1.0356, -0.9941,  0.5842, -0.7817, -0.5730,\n",
+       "         0.2732, -0.6890, -0.0000, -0.0087,  1.3772,  0.3003,  0.0000,  0.8828,\n",
+       "        -1.7060, -0.9499,  0.0000,  1.2618, -0.1124,  0.9352,  0.5854,  1.1139,\n",
+       "         0.1583,  3.3464, -0.4027,  0.5860, -0.8730, -0.0163, -0.7023,  2.1778,\n",
+       "        -3.2313,  1.5753,  0.8494, -1.3516, -2.2013, -1.6432,  0.2581,  0.2197,\n",
+       "        -0.7742, -0.6365, -2.4008,  1.4902,  0.3697, -0.2428,  0.0000, -0.6978,\n",
+       "        -0.0000,  0.7576,  1.7998,  0.0000, -0.8300, -1.0503,  0.4118,  1.4737,\n",
+       "        -1.0162, -1.1784, -0.3985,  0.1699, -0.0000, -0.6951, -1.5820,  1.2909,\n",
+       "         1.7528,  0.1409, -1.3121,  1.7415,  0.5114, -1.7321,  2.0781,  0.5635],\n",
+       "       device='cuda:0')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph_embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "067a0cf7-3010-4b6b-b2aa-d4ce95010d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "模型回复：  How\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One forward pass; gradients are not needed for inference.\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(\n",
+    "        input_ids=input_ids,\n",
+    "        attention_mask=attention_mask,\n",
+    "        graph_embedding=graph_embedding,\n",
+    "    )\n",
+    "\n",
+    "# Greedy choice of the next token: arg-max over the logits of the last position.\n",
+    "logits = outputs.logits[:, -1, :]\n",
+    "predicted_id = logits.argmax(dim=-1)\n",
+    "\n",
+    "# Decode the chosen id back to text and print it.\n",
+    "response_text = tokenizer.decode(predicted_id, skip_special_tokens=True)\n",
+    "print(\"模型回复：\", response_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ae38ed68-bc6a-4bc3-aee8-d54d2dd689ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Response: What are the signal definitions in the Verilog code for the calculator module, and what are their purposes? The Verilog code defines the inputs A, B, and C, and the output Y. A and B are the operands, C is the carry-in, and Y is the result. The purpose of the module is to perform a 2-bit adder, which adds two 2-bit numbers, and the output is the sum. The inputs A and B are the operands, C is the carry-in, and Y is the result. The module is designed to handle the addition operation of two 2-bit numbers, with a carry-in, and a 3-bit output. The implementation involves using logic gates to perform the addition operation, with the sum output connected to the gates. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, involving basic gates and an adder circuit. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, with no complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. 
The output Y is the result of the addition operation. The implementation is straightforward, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is simple, with no need for complex logic gates or delays. The carry-in is used to control whether the carry-out is active or not. The output Y is the result of the addition operation. The implementation is straightforward, with\n"
+     ]
+    }
+   ],
+   "source": [
+    "max_new_tokens = 500\n",
+    "generated_ids = input_ids.clone()\n",
+    "generated_attention_mask = attention_mask.clone()\n",
+    "\n",
+    "# Greedy decoding, one token per iteration. The whole sequence generated so far is\n",
+    "# passed through the model again at every step.\n",
+    "with torch.no_grad():\n",
+    "    for _ in range(max_new_tokens):\n",
+    "        outputs = model(\n",
+    "            input_ids=generated_ids,                  # (batch_size, seq_len)\n",
+    "            attention_mask=generated_attention_mask,  # (batch_size, seq_len)\n",
+    "            graph_embedding=graph_embedding,\n",
+    "        )\n",
+    "\n",
+    "        # Most likely next token according to the logits of the last position.\n",
+    "        logits = outputs.logits[:, -1, :]\n",
+    "        next_token = logits.argmax(dim=-1)\n",
+    "\n",
+    "        # Append the new token to the sequence generated so far.\n",
+    "        generated_ids = torch.cat((generated_ids, next_token.unsqueeze(1)), dim=1)\n",
+    "\n",
+    "        # Stop as soon as EOS is produced; .item() requires a single sequence.\n",
+    "        if next_token.item() == tokenizer.eos_token_id:\n",
+    "            break\n",
+    "\n",
+    "        # Extend the mask by one attended position for the new token.\n",
+    "        generated_attention_mask = torch.cat(\n",
+    "            (generated_attention_mask, generated_attention_mask.new_ones((1, 1))), dim=1\n",
+    "        )\n",
+    "\n",
+    "# Decode the final sequence (prompt plus generated tokens).\n",
+    "generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "print(\"Generated Response:\", generated_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "803f41fe-f504-4c2a-96b4-afc2cd437d01",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[151646,   3838,    525,    279,   8286,  17473,    304,    279,   6250,\n",
+       "          50773,   2038,    369,    279,  29952,   4688,     11,    323,   1128,\n",
+       "            525,    862,   9895,     30]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87d1396b-4d20-4a76-a092-b26a587a76ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

train_data.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c09c4c4be57cf268061c0f20f6e6d877359dd683fbff17f4d3a6b7cffee3dae
+size 364711686