Update README.md
Browse files
README.md
CHANGED
@@ -30,8 +30,6 @@ This llama model was trained 2x faster with [Unsloth](https://github.com/unsloth
|
|
30 |
# Google Colab の場合は上記の環境構築手順を行なわず、単にこのセルから実行していってください。
|
31 |
!pip uninstall unsloth -y
|
32 |
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
|
33 |
-
!pip install --upgrade torch
|
34 |
-
!pip install --upgrade xformers
|
35 |
|
36 |
|
37 |
# Google Colab のデフォルトで入っているパッケージをアップグレード(Moriyasu さんありがとうございます)
|
@@ -126,36 +124,30 @@ dataset = load_dataset("json", data_files="/content/ichikara-instruction-003-001
|
|
126 |
|
127 |
from datasets import load_dataset
|
128 |
|
|
|
129 |
dataset = load_dataset("json", data_files="/content/aio_01_dev.jsonl")
|
130 |
|
131 |
|
132 |
-
#
|
|
|
|
|
|
|
133 |
print("Dataset columns:", dataset.column_names)
|
134 |
|
135 |
-
#
|
136 |
def formatting_prompts_func(examples):
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
return {"formatted_text": f"Processed: {input_text}"}
|
143 |
-
except KeyError:
|
144 |
-
print(f"Key error for examples: {examples}")
|
145 |
-
return {}
|
146 |
-
|
147 |
-
# フィルタリング(必要に応じて)
|
148 |
-
if "text" not in dataset.column_names:
|
149 |
-
print("Warning: 'text' column not found. Filtering dataset...")
|
150 |
-
dataset = dataset.filter(lambda example: "text" in example)
|
151 |
-
|
152 |
-
# map を適用
|
153 |
dataset = dataset.map(
|
154 |
formatting_prompts_func,
|
155 |
-
num_proc=4 #
|
|
|
156 |
)
|
157 |
|
158 |
-
#
|
159 |
print(dataset)
|
160 |
|
161 |
|
@@ -267,7 +259,7 @@ trainer_stats = trainer.train()
|
|
267 |
# omnicampusの開発環境では、左にタスクのjsonlをドラッグアンドドロップしてから実行。
|
268 |
import json
|
269 |
datasets = []
|
270 |
-
with open("
|
271 |
item = ""
|
272 |
for line in f:
|
273 |
line = line.strip()
|
|
|
30 |
# Google Colab の場合は上記の環境構築手順を行なわず、単にこのセルから実行していってください。
|
31 |
!pip uninstall unsloth -y
|
32 |
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
|
|
|
|
|
33 |
|
34 |
|
35 |
# Google Colab のデフォルトで入っているパッケージをアップグレード(Moriyasu さんありがとうございます)
|
|
|
124 |
|
125 |
from datasets import load_dataset
|
126 |
|
127 |
+
# Load the dataset
|
128 |
dataset = load_dataset("json", data_files="/content/aio_01_dev.jsonl")
|
129 |
|
130 |
|
131 |
+
# Print dataset info to inspect its structure
|
132 |
+
print(dataset)
|
133 |
+
|
134 |
+
# Check column names
|
135 |
print("Dataset columns:", dataset.column_names)
|
136 |
|
137 |
+
# Define the formatting function
|
138 |
def formatting_prompts_func(examples):
|
139 |
+
# Access the 'original_question' field directly instead of dynamically finding the key
|
140 |
+
input_text = examples["original_question"]
|
141 |
+
return {"formatted_text": f"Processed: {input_text}"}
|
142 |
+
|
143 |
+
# Apply map function with formatting
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
dataset = dataset.map(
|
145 |
formatting_prompts_func,
|
146 |
+
num_proc=4, # Parallel processing
|
147 |
+
remove_columns=["original_question"] # remove the source 'original_question' column after formatting
|
148 |
)
|
149 |
|
150 |
+
# Print the processed dataset
|
151 |
print(dataset)
|
152 |
|
153 |
|
|
|
259 |
# omnicampusの開発環境では、左にタスクのjsonlをドラッグアンドドロップしてから実行。
|
260 |
import json
|
261 |
datasets = []
|
262 |
+
with open("./elyza-tasks-100-TV_0.jsonl", "r") as f:
|
263 |
item = ""
|
264 |
for line in f:
|
265 |
line = line.strip()
|