mjschock committed
Commit 052a6ed · unverified · 1 parent: e68039e

Update the unsloth_SmolLM2 notebook to refine training hyperparameters and refresh recorded outputs: lower LoRA dropout from 0.1 to 0.05, enable rank-stabilized LoRA, double gradient accumulation (8 -> 16) and max steps (60 -> 120), lengthen warmup (50 -> 100 steps), switch the scheduler to cosine with restarts, and add periodic checkpointing. Output cells and execution counts are updated to reflect the longer run.
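At a glance, the tuning deltas in this commit, as a plain-Python summary of the diff below (values the commit leaves unchanged, such as r=64, lora_alpha=128, and learning_rate=5e-5, are omitted):

```python
# Hyperparameter changes in this commit, old -> new (summarized from the diff).
changes = {
    "lora_dropout": (0.1, 0.05),                  # lighter LoRA regularization
    "use_rslora": (False, True),                  # rank-stabilized LoRA enabled
    "gradient_accumulation_steps": (8, 16),       # effective batch 16 -> 32
    "warmup_steps": (50, 100),                    # more gradual warmup
    "max_steps": (60, 120),                       # twice the optimizer steps
    "lr_scheduler_type": ("cosine", "cosine_with_restarts"),
    "save_strategy": (None, "steps"),             # checkpointing added:
    "save_steps": (None, 30),                     #   every 30 steps,
    "save_total_limit": (None, 2),                #   keep the last 2
}
for name, (old, new) in changes.items():
    print(f"{name}: {old!r} -> {new!r}")
```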

notebooks/unsloth_SmolLM2-135M-Instruct-bnb-4bit_xingyaoww_code-act.ipynb CHANGED
@@ -207,7 +207,7 @@
   "name": "stderr",
   "output_type": "stream",
   "text": [
-  "Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.\n",
+  "Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.\n",
   "Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.\n",
   "Unsloth 2025.4.3 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.\n"
   ]
@@ -216,17 +216,16 @@
   "source": [
   "model = FastLanguageModel.get_peft_model(\n",
   " model,\n",
-  " r=64, # Increased from 16\n",
+  " r=64, # Keep at 64 as it's working well\n",
   " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
-  " \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
-  " lora_alpha=128, # 2*r\n",
-  " lora_dropout=0.1, # Added dropout\n",
-  " bias = \"none\", # Supports any, but = \"none\" is optimized\n",
-  " # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n",
-  " use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n",
+  " \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
+  " lora_alpha=128, # Keep at 2*r as it's working well\n",
+  " lora_dropout=0.05, # Reduced from 0.1 to 0.05 for better balance\n",
+  " bias = \"none\", # Keep as is for optimization\n",
+  " use_gradient_checkpointing = \"unsloth\",\n",
   " random_state = 3407,\n",
-  " use_rslora = False, # We support rank stabilized LoRA\n",
-  " loftq_config = None, # And LoftQ\n",
+  " use_rslora = True, # Enable rank stabilized LoRA for better stability\n",
+  " loftq_config = None,\n",
   ")"
   ]
   },
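Two notes on this cell. First, per the stderr above, Unsloth fast-patches layers only when LoRA dropout is exactly 0, so reducing it from 0.1 to 0.05 keeps the patching warning and its performance hit; only dropout = 0 would avoid it. Second, `use_rslora=True` switches the adapter scaling factor from `lora_alpha / r` to `lora_alpha / sqrt(r)` (rank-stabilized LoRA), which changes the effective strength of the adapter at r=64. A back-of-the-envelope sketch:

```python
import math

r, lora_alpha = 64, 128  # values from the cell above

standard_scale = lora_alpha / r            # classic LoRA:  128 / 64 = 2.0
rslora_scale = lora_alpha / math.sqrt(r)   # rsLoRA:        128 / 8  = 16.0

# A LoRA-adapted layer computes roughly h = W0 @ x + scale * (B @ A @ x),
# so flipping use_rslora=True here makes the adapter contribution 8x larger
# unless lora_alpha is retuned alongside it.
print(standard_scale, rslora_scale)  # 2.0 16.0
```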
@@ -650,22 +649,25 @@
   " dataset_text_field = \"text\",\n",
   " max_seq_length = max_seq_length,\n",
   " dataset_num_proc = 2,\n",
-  " packing = False, # Can make training 5x faster for short sequences.\n",
+  " packing = False,\n",
   " args = TrainingArguments(\n",
   " per_device_train_batch_size = 2,\n",
-  " gradient_accumulation_steps=8, # Increased from 4\n",
-  " warmup_steps=50,\n",
-  " max_steps = 60,\n",
-  " learning_rate=5e-5, # Reduced from 2e-4\n",
+  " gradient_accumulation_steps=16, # Increased from 8 for better stability\n",
+  " warmup_steps=100, # Increased from 50 for more gradual warmup\n",
+  " max_steps = 120, # Increased from 60 to allow more training\n",
+  " learning_rate=5e-5, # Keep at 5e-5 as it's working well\n",
   " fp16 = not is_bfloat16_supported(),\n",
   " bf16 = is_bfloat16_supported(),\n",
   " logging_steps = 1,\n",
   " optim = \"adamw_8bit\",\n",
   " weight_decay = 0.01,\n",
-  " lr_scheduler_type=\"cosine\", # Changed to cosine schedule\n",
+  " lr_scheduler_type=\"cosine_with_restarts\", # Changed to cosine with restarts for better optimization\n",
   " seed = 3407,\n",
   " output_dir = \"outputs\",\n",
-  " gradient_checkpointing=True, # Added gradient checkpointing\n",
+  " gradient_checkpointing=True,\n",
+  " save_strategy=\"steps\", # Add checkpointing\n",
+  " save_steps=30, # Save every 30 steps\n",
+  " save_total_limit=2, # Keep last 2 checkpoints\n",
   " ),\n",
   ")"
   ]
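The new arguments double both the per-step work and the step count. A quick budget check against the numbers in the run banner below (7,139 examples, one GPU):

```python
per_device_train_batch_size = 2
gradient_accumulation_steps = 16
num_gpus = 1
max_steps = 120
num_examples = 7_139

effective_batch = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch)                             # 32, as the banner reports
print(effective_batch * max_steps)                 # 3,840 sequences seen in total
print(effective_batch * max_steps / num_examples)  # ~0.54, still about half an epoch
```

With save_strategy="steps" and save_steps=30, checkpoints land at steps 30, 60, 90, and 120, and save_total_limit=2 keeps only the last two; an interrupted run can then be resumed with trainer.train(resume_from_checkpoint=True).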
@@ -717,9 +719,9 @@
   "output_type": "stream",
   "text": [
   "==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1\n",
-  " \\\\ /| Num examples = 7,139 | Num Epochs = 1 | Total steps = 60\n",
-  "O^O/ \\_/ \\ Batch size per device = 2 | Gradient accumulation steps = 8\n",
-  "\\ / Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16\n",
+  " \\\\ /| Num examples = 7,139 | Num Epochs = 1 | Total steps = 120\n",
+  "O^O/ \\_/ \\ Batch size per device = 2 | Gradient accumulation steps = 16\n",
+  "\\ / Data Parallel GPUs = 1 | Total batch size (2 x 16 x 1) = 32\n",
   " \"-____-\" Trainable parameters = 19,537,920/4,000,000,000 (0.49% trained)\n",
   "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.\n",
   "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmjschock\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
@@ -740,7 +742,7 @@
   {
   "data": {
   "text/html": [
-  "Run data is saved locally in <code>/home/mjschock/Projects/hf-agents-course/agents/Final_Assignment_Template/notebooks/wandb/run-20250430_141858-0ph0gbln</code>"
+  "Run data is saved locally in <code>/home/mjschock/Projects/hf-agents-course/agents/Final_Assignment_Template/notebooks/wandb/run-20250430_204355-efwwlovd</code>"
   ],
   "text/plain": [
   "<IPython.core.display.HTML object>"
@@ -752,7 +754,7 @@
   {
   "data": {
   "text/html": [
-  "Syncing run <strong><a href='https://wandb.ai/mjschock/huggingface/runs/0ph0gbln' target=\"_blank\">outputs</a></strong> to <a href='https://wandb.ai/mjschock/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+  "Syncing run <strong><a href='https://wandb.ai/mjschock/huggingface/runs/efwwlovd' target=\"_blank\">outputs</a></strong> to <a href='https://wandb.ai/mjschock/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
   ],
   "text/plain": [
   "<IPython.core.display.HTML object>"
@@ -776,7 +778,7 @@
   {
   "data": {
   "text/html": [
-  " View run at <a href='https://wandb.ai/mjschock/huggingface/runs/0ph0gbln' target=\"_blank\">https://wandb.ai/mjschock/huggingface/runs/0ph0gbln</a>"
+  " View run at <a href='https://wandb.ai/mjschock/huggingface/runs/efwwlovd' target=\"_blank\">https://wandb.ai/mjschock/huggingface/runs/efwwlovd</a>"
   ],
   "text/plain": [
   "<IPython.core.display.HTML object>"
@@ -798,8 +800,8 @@
   "\n",
   " <div>\n",
   " \n",
-  " <progress value='60' max='60' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-  " [60/60 3:06:36, Epoch 0/1]\n",
+  " <progress value='120' max='120' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+  " [120/120 12:38:52, Epoch 0/1]\n",
   " </div>\n",
   " <table border=\"1\" class=\"dataframe\">\n",
   " <thead>\n",
@@ -811,243 +813,483 @@
   " <tbody>\n",
   (table body: one row per optimizer step, giving the step number and its training loss)
-  old run, steps 1-60:
-    1-10:    1.6617 1.6931 1.7475 1.8626 1.7045 1.7279 1.8695 1.5703 1.9103 1.9443
-    11-20:   1.8300 1.7157 1.6478 1.8424 1.7416 1.7538 1.7035 1.7008 1.9553 1.6450
-    21-30:   1.6632 1.6371 1.5860 1.8133 1.5415 1.6580 1.6661 1.6593 1.6390 1.7972
-    31-40:   1.4981 1.5968 1.6080 1.6087 1.7246 1.4987 1.4536 1.4935 1.7113 1.7230
-    41-50:   1.4402 1.6280 1.4358 1.3487 1.3407 1.4282 1.2579 1.4626 1.5203 1.4038
-    51-60:   1.4724 1.2459 1.1132 1.3383 1.3756 1.1588 1.2084 1.1609 1.2031 1.1670
+  new run, steps 1-120:
+    1-10:    1.6767 1.8046 1.7186 1.7323 1.9312 1.7802 1.7456 1.7531 1.7084 1.8016
+    11-20:   1.6582 1.6998 1.6127 1.6743 1.7289 1.5625 1.6231 1.6312 1.4917 1.7406
+    21-30:   1.5629 1.4196 1.4165 1.3928 1.5041 1.4013 1.2898 1.3089 1.2275 1.2075
+    31-40:   1.2130 1.1977 1.2959 1.2338 1.1739 0.9305 0.9040 1.2036 1.0747 1.0692
+    41-50:   1.0981 1.0398 0.9319 0.9592 1.0075 1.0280 1.1186 1.0134 0.9161 0.9683
+    51-60:   0.8856 0.9543 0.9450 0.9840 0.8870 0.8711 0.9098 0.8012 0.8424 0.8271
+    61-70:   0.8588 0.9878 0.8312 0.7860 0.7114 0.8361 0.8626 0.9179 0.9108 0.6176
+    71-80:   0.4992 0.9242 0.8925 0.7486 1.0027 0.7519 0.8549 0.6429 0.6704 0.6564
+    81-90:   0.8204 0.8918 0.7231 0.8124 1.0065 0.5680 0.7043 0.6239 0.8891 0.5599
+    91-100:  0.9107 0.8003 0.8300 0.6694 0.6267 0.8177 0.7865 0.6879 0.6873 0.9542
+    101-110: 0.7176 0.6291 0.7186 0.6620 0.6152 0.6753 0.8591 0.8327 0.7779 0.5895
+    111-120: 0.7301 0.7155 0.7502 0.6759 0.6357 0.6505 0.6964 0.8712 0.7370 0.5768
   " </tr>\n",
   " </tbody>\n",
   "</table><p>"
@@ -1080,8 +1322,8 @@
   "name": "stdout",
   "output_type": "stream",
   "text": [
-  "11430.7159 seconds used for training.\n",
-  "190.51 minutes used for training.\n",
+  "45963.0635 seconds used for training.\n",
+  "766.05 minutes used for training.\n",
   "Peak reserved memory = 2.342 GB.\n",
   "Peak reserved memory for training = 2.117 GB.\n",
   "Peak reserved memory % of max memory = 59.442 %.\n",
@@ -1135,7 +1377,7 @@
   {
   "data": {
   "text/plain": [
-  "['<|im_start|>user\\nContinue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\\n<|im_start|>assistant\\nThe Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, starting from 1 and 1. To find the first few Fibonacci numbers, we can use a simple iterative approach.\\n\\nWe begin by assuming the first Fibonacci number is 0. Then, we']"
+  "['<|im_start|>user\\nContinue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\\n<|im_start|>assistant\\nThe Fibonacci sequence is a series of numbers where each number is the sum of the previous two numbers, starting from 0 and 1. To find the Fibonacci sequence, we can start with the first two terms, 1 and 1, and then add the last two terms, 1 and 2']"
   ]
   },
   "execution_count": 12,
@@ -1196,9 +1438,18 @@
   "<|im_start|>user\n",
   "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\n",
   "<|im_start|>assistant\n",
-  "The Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, starting from 1 and 1. To find the first few Fibonacci numbers, we can use a simple iterative approach.\n",
+  "To find the nth Fibonacci number, we can start with the first few terms of the Fibonacci sequence: 1, 1, 2, 3, 5, 8, 13, 21, 34, and so on.\n",
+  "\n",
+  "The Fibonacci sequence is defined recursively as:\n",
   "\n",
-  "We begin by assuming the first Fibonacci number is 0. Then, we multiply the current Fibonacci number by 1 and add it to the previous one. This results in a new Fibonacci number of 1 + 0 = 1. Next, we multiply the current Fibonacci number by 2 and add it to the previous one. This results in a new Fibonacci number of\n"
+  "F(0) = 0\n",
+  "F(1) = 1\n",
+  "F(2) = 1\n",
+  "F(3) = 1\n",
+  "F(4) = 2\n",
+  "F(5) = 3\n",
+  "F(6) = 5\n",
+  "F\n"
   ]
   }
   ],
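The generation cell itself is not part of this diff, but given the <|im_start|> markers, the completions were presumably produced with the tokenizer's chat template plus Unsloth's inference mode, along these lines (`model`, `tokenizer`, and `max_new_tokens=128` are assumptions carried over from cells not shown here):

```python
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)  # switch the fine-tuned model to fast inference

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids=input_ids, max_new_tokens=128, use_cache=True)
print(tokenizer.batch_decode(output_ids)[0])
```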
@@ -1290,9 +1541,9 @@
   "<|im_start|>user\n",
   "What is a famous tall tower in Paris?<|im_end|>\n",
   "<|im_start|>assistant\n",
-  "The famous Tall Tower in Paris is the Arc de Triomphe, a monumental arch built in 1789 by Napoleon Bonaparte to commemorate the 18th birthday of his wife, Marie Antoinette. It is a symbol of the French monarchy and a symbol of the French Empire. The Arc de Triomphe is 140 meters tall and 150 meters wide, with a height of 150 meters. It is located in the Latin Quarter of Paris, a historic neighborhood that has been home to the French monarchy since the 18th century.\n",
+  "The famous tall tower in Paris is the Arc de Triomphe, a 13th-century French landmark that was built in 1882. It is a 130-meter-tall Gothic spire that serves as a symbol of Paris and is considered one of the most recognizable landmarks in the world. The tower is located in the heart of the city, and its height is considered one of the most impressive in the world.\n",
   "\n",
-  "The Arc de Triomphe\n"
+  "The Arc de Triomphe is a tribute to the French monarchy and is a symbol of the city's history and culture. It was built in 1882\n"
   ]
   }
   ],