Upload folder using huggingface_hub
- .code-workspace +8 -0
- .gitignore +7 -0
- .vscode/settings.json +5 -0
- README.md +13 -3
- configs/train_config.yaml +10 -0
- data/external +0 -0
- data/processed +0 -0
- data/raw +0 -0
- demo_app/app.py +17 -0
- docs/setup_guide.md +23 -0
- docs/usage.md +13 -0
- environment.local.yml +11 -0
- notebooks/exploratory +0 -0
- notebooks/inference +0 -0
- notebooks/modeling +0 -0
- requirements.txt +6 -0
- src/cloud_deploy/deploy.py +5 -0
- src/cloud_deploy/deploy_aws.py +6 -0
- src/eval/evaluate.py +18 -0
- src/model/train.py +53 -0
- src/orchestration/langchain_pipeline.py +25 -0
- src/orchestration/workflow.py +12 -0
- src/preprocessing/preprocess_data.py +19 -0
- src/utils/logger.py +16 -0
- src/utils/utils.py +16 -0
- tests/test_basic.py +11 -0
- tests/test_preprocessing.py +12 -0
.code-workspace
ADDED
@@ -0,0 +1,8 @@
+{
+  "folders": [
+    { "path": "C:/Users/hp/Desktop/llm-finetuning-cloud-orchestration" }
+  ],
+  "settings": {
+    "python.pythonPath": "env/bin/python"
+  }
+}
.gitignore
ADDED
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+.env
+env/
+logs/
+models/
+results/
.vscode/settings.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "cSpell.words": [
+    "langchain"
+  ]
+}
README.md
CHANGED
@@ -1,3 +1,13 @@
-
-
-
+# Finetuning an Open-Source LLM
+
+This project adapts large language models to domain-specific tasks, leveraging parameter-efficient techniques (LoRA/QLoRA), cloud deployment, and workflow orchestration.
+
+## Getting Started
+- Clone this repo
+- Install Python dependencies
+- See `demo_app/app.py` to launch the demo
+
+## Structure
+- Models: Fine-tuned checkpoints
+- Demo App: Streamlit/Gradio interface
+- Configs: Training/deployment configs
configs/train_config.yaml
ADDED
@@ -0,0 +1,10 @@
+model_name: "llama-base"
+learning_rate: 5e-5
+batch_size: 16
+num_epochs: 3
+lora:
+  rank: 8
+  alpha: 16
+  dropout: 0.1
+dataset_path: "data/processed/train.json"
+output_dir: "models/llm-finetuned"
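For reference, `src/model/train.py` below reads this file; a minimal sketch of parsing it, assuming PyYAML is installed (it is not listed in requirements.txt):

import yaml

# Parse the training config; the keys mirror configs/train_config.yaml above.
with open("configs/train_config.yaml") as f:
    config = yaml.safe_load(f)

print(config["model_name"], config["lora"]["rank"])  # llama-base 8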
data/external
ADDED
File without changes
data/processed
ADDED
File without changes
data/raw
ADDED
File without changes
demo_app/app.py
ADDED
@@ -0,0 +1,17 @@
+import streamlit as st
+from transformers import pipeline
+
+st.title("Fine-tuned LLM Demo")
+
+# Load a model pipeline (replace 'gpt2' with your actual model repo id if needed)
+generator = pipeline('text-generation', model='gpt2')
+
+prompt = st.text_input("Enter prompt:")
+
+# Add a slider to select number of responses
+num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)
+
+if prompt:
+    results = generator(prompt, max_length=100, do_sample=True, num_return_sequences=num_responses)  # sampling is required for >1 sequence
+    for i, result in enumerate(results):
+        st.write(f"Output {i+1}: {result['generated_text']}")
docs/setup_guide.md
ADDED
@@ -0,0 +1,23 @@
+# Project Setup Guide
+
+## Installation
+
+1. Create Python environment:
+   python -m venv env
+   source env/bin/activate    # Linux/macOS
+   .\env\Scripts\activate     # Windows
+
+2. Install dependencies:
+   pip install -r requirements.txt
+
+3. Run training example:
+   python src/model/train.py
+
+---
+
+## Usage
+
+- Run demo app:
+  streamlit run demo_app/app.py
docs/usage.md
ADDED
@@ -0,0 +1,13 @@
+# Project Usage
+
+## Setting Up
+1. Install dependencies: `pip install -r requirements.txt`
+2. Prepare dataset in `data/processed/`
+3. Run training: `python src/model/train.py --config configs/train_config.yaml`
+4. Launch demo app: `streamlit run demo_app/app.py`
+
+## Project Structure
+- `models/`: Fine-tuned checkpoints
+- `demo_app/`: Demo frontend interface
+- `configs/`: Configs and hyperparameters
+- `tests/`: Unit tests
environment.local.yml
ADDED
@@ -0,0 +1,11 @@
+name: llm-finetune-env
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  - python=3.10
+  - pytorch
+  - transformers
+  - streamlit
+  - gradio
+  - langchain
notebooks/exploratory
ADDED
File without changes
notebooks/inference
ADDED
File without changes
notebooks/modeling
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+transformers
+torch
+datasets
+streamlit
+gradio
+langchain
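Note: `src/model/train.py` parses the YAML training config, so `pyyaml` is an additional dependency not pinned here (installable with `pip install pyyaml`).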
src/cloud_deploy/deploy.py
ADDED
@@ -0,0 +1,5 @@
+def deploy_to_aws():
+    print("Placeholder: Add AWS deployment code here")
+
+if __name__ == "__main__":
+    deploy_to_aws()
src/cloud_deploy/deploy_aws.py
ADDED
@@ -0,0 +1,6 @@
+def deploy_model_aws():
+    # Placeholder for AWS deployment logic using boto3, AWS SDK, or AWS CLI commands
+    print("Deploying model to AWS... (implement deployment logic here)")
+
+if __name__ == "__main__":
+    deploy_model_aws()
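The placeholder above mentions boto3; a minimal sketch of one possible first step, uploading the fine-tuned weights to S3 (the bucket name and object key are hypothetical):

import boto3

def upload_model_to_s3(bucket="my-llm-artifacts", model_dir="models/llm-finetuned"):
    # Upload one weights file; a real deployment would sync the whole directory.
    s3 = boto3.client("s3")
    s3.upload_file(f"{model_dir}/pytorch_model.bin", bucket, "llm-finetuned/pytorch_model.bin")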
src/eval/evaluate.py
ADDED
@@ -0,0 +1,18 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch
+
+def evaluate_model(model_path, test_sentences):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+    for sentence in test_sentences:
+        output = generator(sentence, max_length=50, num_return_sequences=1)
+        print(f"Input: {sentence}\nOutput: {output[0]['generated_text']}\n")
+
+if __name__ == "__main__":
+    test_samples = [
+        "How does fine-tuning work?",
+        "Explain parameter-efficient methods like LoRA."
+    ]
+    evaluate_model("models/llm-finetuned", test_samples)
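The script above only prints generations; a hedged sketch of a quantitative complement, computing perplexity on a held-out sentence (reusing the repo's `models/llm-finetuned` path):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def perplexity(model_path, text):
    # Perplexity is exp(mean token-level cross-entropy) of the model on the text.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        loss = model(**enc, labels=enc["input_ids"]).loss
    return torch.exp(loss).item()

print(perplexity("models/llm-finetuned", "How does fine-tuning work?"))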
src/model/train.py
ADDED
@@ -0,0 +1,53 @@
+import argparse
+from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+from datasets import load_dataset
+import yaml
+
+def load_config(config_path):
+    # The config file is YAML, so parse it with yaml.safe_load (json.load would fail here)
+    with open(config_path, "r") as f:
+        return yaml.safe_load(f)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", default="configs/train_config.yaml")
+    config = load_config(parser.parse_args().config)
+
+    model_name = config["model_name"]
+    dataset_path = config["dataset_path"]
+    output_dir = config["output_dir"]
+    learning_rate = config["learning_rate"]
+    batch_size = config["batch_size"]
+    num_epochs = config["num_epochs"]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token  # causal-LM tokenizers often lack a pad token
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    # Example dataset loading; replace with your data loading pipeline
+    dataset = load_dataset("json", data_files={"train": dataset_path})
+    def tokenize_function(examples):
+        return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        overwrite_output_dir=True,
+        learning_rate=learning_rate,
+        per_device_train_batch_size=batch_size,
+        num_train_epochs=num_epochs,
+        save_strategy="epoch",
+        logging_dir='./logs',
+        logging_steps=10,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset["train"],
+        tokenizer=tokenizer,
+        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),  # provides causal-LM labels so the Trainer can compute a loss
+    )
+
+    trainer.train()
+    trainer.save_model(output_dir)
+
+if __name__ == "__main__":
+    main()
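Note that the config's `lora:` block (rank/alpha/dropout) is never applied in train.py. A hedged sketch of wiring it in with the peft library (an assumption; peft is not in requirements.txt), placed after the model is loaded and before the Trainer is built:

from peft import LoraConfig, get_peft_model

# Hypothetical wiring of configs/train_config.yaml's lora block; peft is assumed installed.
lora_cfg = LoraConfig(
    r=config["lora"]["rank"],
    lora_alpha=config["lora"]["alpha"],
    lora_dropout=config["lora"]["dropout"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)  # wrap the base model before passing it to Trainer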
src/orchestration/langchain_pipeline.py
ADDED
@@ -0,0 +1,25 @@
+from langchain.llms import HuggingFacePipeline
+
+# Load your fine-tuned HuggingFace model as a pipeline
+llm = HuggingFacePipeline.from_model_id(
+    model_id="your-model-name-or-path",  # e.g. "yourusername/your-finetuned-model"
+    task="text-generation"
+)
+
+# Define functions for each step (customize as needed)
+def step_1(input_text):
+    # Example step: preprocess input
+    return input_text.strip()
+
+def step_2(processed_input):
+    # Example step: pass through LLM
+    return llm(processed_input)
+
+def step_3(llm_output):
+    # Example step: post-process output
+    return llm_output.upper()  # just an example
+
+# Run the steps in order. LangChain's SequentialChain expects Chain objects,
+# not plain functions, so simple function composition is used here instead.
+def run_pipeline(input_text):
+    return step_3(step_2(step_1(input_text)))
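A hypothetical invocation of the composed steps:

if __name__ == "__main__":
    print(run_pipeline("  Explain LoRA in one sentence.  "))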
src/orchestration/workflow.py
ADDED
@@ -0,0 +1,12 @@
+from langchain import PromptTemplate, LLMChain
+from langchain.llms import HuggingFaceHub
+
+def run_workflow(prompt):
+    llm = HuggingFaceHub(repo_id="facebook/opt-350m", model_kwargs={"temperature": 0.7})  # requires HUGGINGFACEHUB_API_TOKEN to be set
+    template = PromptTemplate(input_variables=["topic"], template="Explain {topic} in simple terms.")
+    chain = LLMChain(prompt=template, llm=llm)  # LLMChain's argument is `prompt`, not `prompt_template`
+    response = chain.run(topic=prompt)
+    print(response)
+
+if __name__ == "__main__":
+    run_workflow("parameter-efficient fine-tuning (LoRA)")
src/preprocessing/preprocess_data.py
ADDED
@@ -0,0 +1,19 @@
+from transformers import AutoTokenizer
+
+def preprocess_data(data, model_name="facebook/opt-350m"):
+    """
+    Tokenizes input text data using the chosen tokenizer.
+    Args:
+        data (list of str): Input sentences.
+        model_name (str): Pretrained model tokenizer.
+    Returns:
+        tokenized dataset (dict): Dictionary of tokenized inputs.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenized = tokenizer(data, padding=True, truncation=True, return_tensors="pt")
+    return tokenized
+
+if __name__ == "__main__":
+    sample = ["Fine-tuning an open-source LLM.", "This is a sample sentence."]
+    tokenized_sample = preprocess_data(sample)
+    print(tokenized_sample)
src/utils/logger.py
ADDED
@@ -0,0 +1,16 @@
+import logging
+import os
+
+def setup_logger(name="train_log", log_file="logs/training.log"):
+    os.makedirs(os.path.dirname(log_file), exist_ok=True)
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    if not logger.handlers:  # avoid adding duplicate handlers on repeated calls
+        fh = logging.FileHandler(log_file)
+        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        fh.setFormatter(formatter)
+        logger.addHandler(fh)
+    return logger
+
+if __name__ == "__main__":
+    log = setup_logger()
+    log.info("Logger initialized.")
src/utils/utils.py
ADDED
@@ -0,0 +1,16 @@
+import os
+import logging
+
+def setup_logging(log_dir="logs", log_file="training.log"):
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, log_file)
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        handlers=[logging.FileHandler(log_path), logging.StreamHandler()]
+    )
+    return logging.getLogger()
+
+if __name__ == "__main__":
+    logger = setup_logging()
+    logger.info("Logger is set up and ready.")
tests/test_basic.py
ADDED
@@ -0,0 +1,11 @@
+import unittest
+from src.preprocessing.preprocess_data import preprocess_data
+
+class TestPreprocessing(unittest.TestCase):
+    def test_tokenize(self):
+        sample = ["This is a test."]
+        result = preprocess_data(sample)
+        self.assertIn("input_ids", result)
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_preprocessing.py
ADDED
@@ -0,0 +1,12 @@
+import unittest
+from src.preprocessing.preprocess_data import preprocess_data
+
+class TestPreprocessing(unittest.TestCase):
+    def test_sample(self):
+        sample_data = ["Hello World!"]
+        processed = preprocess_data(sample_data)
+        # preprocess_data returns tokenized tensors keyed by feature name, not raw strings
+        self.assertIn("input_ids", processed)
+
+if __name__ == "__main__":
+    unittest.main()