---
license: apache-2.0
language:
- en
metrics:
- bleu
- rouge
tags:
- causal-lm
- code
- cypher
- graph
- neo4j
---

## Model Description

A fine-tune of https://huggingface.co/stabilityai/stable-code-instruct-3b trained on https://github.com/neo4j-labs/text2cypher/tree/main/datasets/synthetic_opus_demodbs to generate Cypher statements for querying graph databases such as Neo4j.

## Usage

### Safetensors (recommended)

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the model and tokenizer
print("Loading model...")
model_name = "path/to/your/safetensors/model"  # e.g. ./stable-cypher-instruct-3b
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Define your question.
# NOTE: the instruction is VERY IMPORTANT; the model was fine-tuned on this exact
# system prompt. Expect bad performance without it.
instruction = "Create a Cypher statement to answer the following question:"
question = "List the first 3 articles mentioning organizations with a revenue less than 5 million."

# Create the full prompt
full_prompt = f"{instruction}\n\nHuman: {question}\n\nAssistant:"

# Tokenize the input and move it to the model's device
inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

# Generate response
print("Generating response...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        top_p=0.9,
        temperature=0.2,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode and remove the input prompt from the output
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = answer[len(full_prompt):].strip()

print("\nQuestion:", question)
print("\nGenerated Cypher statement:")
print(answer)
```

### GGUF

```python
from llama_cpp import Llama

# Load the GGUF model
print("Loading model...")
model = Llama(
    model_path="stable-cypher-instruct-3b.Q4_K_M.gguf",
    n_ctx=512,
    n_batch=512,
    n_gpu_layers=-1,  # Offload all available layers to the GPU
    verbose=False,
)

# Define your question.
# NOTE: the instruction is VERY IMPORTANT; the model was fine-tuned on this exact
# system prompt. Expect bad performance without it.
instruction = "Create a Cypher statement to answer the following question:"
question = "List the first 3 articles mentioning organizations with a revenue less than 5 million."

# Create the full prompt
full_prompt = f"{instruction}\n\nHuman: {question}\n\nAssistant:"

# Generate response (sampling parameters belong on the call, not the constructor)
print("Generating response...")
response = model(
    full_prompt,
    max_tokens=128,
    top_p=0.9,
    temperature=0.2,
    stop=["Human:", "\n\n"],
    echo=False,
)

# Extract and print the generated response
answer = response['choices'][0]['text'].strip()
print("\nQuestion:", question)
print("\nGenerated Cypher statement:")
print(answer)
```
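
To run the generated statement against a live database, a minimal sketch using the official `neo4j` Python driver might look like this (the URI and credentials are placeholders; `answer` is the statement generated above):

```python
from neo4j import GraphDatabase

# Placeholder connection details -- point these at your own Neo4j instance
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "password")

driver = GraphDatabase.driver(URI, auth=AUTH)
with driver.session() as session:
    # `answer` holds the Cypher statement generated by the model
    for record in session.run(answer):
        print(record)
driver.close()
```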

## Performance

| Metric  | stable-code-instruct-3b | stable-cypher-instruct-3b |
| ------- | ----------------------- | ------------------------- |
| BLEU-4  | 19.07                   | 88.63                     |
| ROUGE-1 | 39.49                   | 95.09                     |
| ROUGE-2 | 24.82                   | 90.71                     |
| ROUGE-L | 29.63                   | 91.51                     |
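
The exact evaluation parameters are shown under "Eval params" below. As a rough illustration (not the exact eval script), scores of this kind can be computed with the Hugging Face `evaluate` library; the prediction/reference pair here is hypothetical:

```python
import evaluate

# Hypothetical example pair: generated Cypher vs. reference Cypher
predictions = ["MATCH (a:Article) RETURN a LIMIT 3"]
references = ["MATCH (a:Article) RETURN a LIMIT 3"]

bleu = evaluate.load("bleu")    # max_order defaults to 4, i.e. BLEU-4
rouge = evaluate.load("rouge")  # reports rouge1, rouge2 and rougeL

print(bleu.compute(predictions=predictions, references=references))
print(rouge.compute(predictions=predictions, references=references))
```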

### Example

#### Stable Cypher


#### Stable Code


### Eval params


## Reproducibility

This is the config file from LLaMA Factory:

```json
{
  "top.model_name": "Custom",
  "top.finetuning_type": "lora",
  "top.adapter_path": [],
  "top.quantization_bit": "none",
  "top.template": "default",
  "top.rope_scaling": "none",
  "top.booster": "none",
  "train.training_stage": "Supervised Fine-Tuning",
  "train.dataset_dir": "data",
  "train.dataset": [
    "cypher_opus"
  ],
  "train.learning_rate": "2e-4",
  "train.num_train_epochs": "5.0",
  "train.max_grad_norm": "1.0",
  "train.max_samples": "5000",
  "train.compute_type": "fp16",
  "train.cutoff_len": 256,
  "train.batch_size": 16,
  "train.gradient_accumulation_steps": 2,
  "train.val_size": 0.1,
  "train.lr_scheduler_type": "cosine",
  "train.logging_steps": 10,
  "train.save_steps": 100,
  "train.warmup_steps": 20,
  "train.neftune_alpha": 0,
  "train.optim": "adamw_torch",
  "train.resize_vocab": false,
  "train.packing": false,
  "train.upcast_layernorm": false,
  "train.use_llama_pro": false,
  "train.shift_attn": false,
  "train.report_to": false,
  "train.num_layer_trainable": 3,
  "train.name_module_trainable": "all",
  "train.lora_rank": 64,
  "train.lora_alpha": 64,
  "train.lora_dropout": 0.1,
  "train.loraplus_lr_ratio": 0,
  "train.create_new_adapter": false,
  "train.use_rslora": false,
  "train.use_dora": true,
  "train.lora_target": "",
  "train.additional_target": "",
  "train.dpo_beta": 0.1,
  "train.dpo_ftx": 0,
  "train.orpo_beta": 0.1,
  "train.reward_model": null,
  "train.use_galore": false,
  "train.galore_rank": 16,
  "train.galore_update_interval": 200,
  "train.galore_scale": 0.25,
  "train.galore_target": "all"
}
```
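
For readers reproducing the LoRA outside LLaMA Factory, the hyperparameters above map roughly onto a PEFT `LoraConfig` as sketched below; this is an approximate translation, not LLaMA Factory's internals:

```python
from peft import LoraConfig

# Rough PEFT equivalent of the LoRA settings above. Note that LLaMA Factory
# resolves the target modules itself when "train.lora_target" is left empty.
lora_config = LoraConfig(
    r=64,              # "train.lora_rank"
    lora_alpha=64,     # "train.lora_alpha"
    lora_dropout=0.1,  # "train.lora_dropout"
    use_dora=True,     # "train.use_dora"
    task_type="CAUSAL_LM",
)
```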

I used llama.cpp to merge the LoRA and generate the quants.
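
As an alternative illustration of the merge step, the adapter can also be folded into the base model in Python with PEFT before GGUF conversion; a sketch with placeholder paths:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder adapter path -- adjust to where your LoRA checkpoint lives
base = AutoModelForCausalLM.from_pretrained("stabilityai/stable-code-instruct-3b")
merged = PeftModel.from_pretrained(base, "path/to/lora/adapter").merge_and_unload()

# Save the merged weights; GGUF conversion and quantization then happen
# with llama.cpp's conversion tooling
merged.save_pretrained("stable-cypher-instruct-3b")
AutoTokenizer.from_pretrained("stabilityai/stable-code-instruct-3b").save_pretrained("stable-cypher-instruct-3b")
```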

The progress over the base model is significant, but this is just a first draft.
I ran a few training runs, tinkering with some of the values, but was far from being exhaustive and thorough.
My main concern is the model's ability to generalize to unseen fields and syntax.
I'm open to making a v2, which should be production-ready, along with a full tutorial, if there is enough interest in this project.