promptsbyesha committed · verified
Commit 72f4d4d · Parent: ab4aa96

Upload folder using huggingface_hub
.code-workspace ADDED
@@ -0,0 +1,8 @@
+ {
+   "folders": [
+     { "path": "C:/Users/hp/Desktop/llm-finetuning-cloud-orchestration" }
+   ],
+   "settings": {
+     "python.pythonPath": "env/bin/python"
+   }
+ }
.gitignore ADDED
@@ -0,0 +1,7 @@
+ __pycache__/
+ *.pyc
+ .env
+ env/
+ logs/
+ models/
+ results/
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "cSpell.words": [
+     "langchain"
+   ]
+ }
README.md CHANGED
@@ -1,3 +1,13 @@
- ---
- license: apache-2.0
- ---
+ # Finetuning an Open-Source LLM
+
+ This project adapts large language models to domain-specific tasks, leveraging parameter-efficient techniques (LoRA/QLoRA), cloud deployment, and workflow orchestration.
+
+ ## Getting Started
+ - Clone this repo
+ - Install Python dependencies
+ - See `demo_app/app.py` to launch the demo
+
+ ## Structure
+ - Models: Fine-tuned checkpoints
+ - Demo App: Streamlit/Gradio interface
+ - Configs: Training/deployment configs
configs/train_config.yaml ADDED
@@ -0,0 +1,10 @@
+ model_name: "llama-base"
+ learning_rate: 5.0e-5  # written with a decimal point so PyYAML parses it as a float, not a string
+ batch_size: 16
+ num_epochs: 3
+ lora:
+   rank: 8
+   alpha: 16
+   dropout: 0.1
+ dataset_path: "data/processed/train.json"
+ output_dir: "models/llm-finetuned"
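The `lora:` block above defines adapter hyperparameters, but `src/model/train.py` below never applies them. A minimal sketch of how these values could be wired in with the `peft` library, assuming `peft` is installed (it is not listed in `requirements.txt`), with `gpt2` standing in for the placeholder `llama-base` id:

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# "llama-base" in train_config.yaml is a placeholder, so gpt2 stands in here
model = AutoModelForCausalLM.from_pretrained("gpt2")

lora_config = LoraConfig(
    r=8,               # lora.rank from train_config.yaml
    lora_alpha=16,     # lora.alpha
    lora_dropout=0.1,  # lora.dropout
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints the small fraction of weights LoRA trains
```

Passing the wrapped model to the existing `Trainer` call in `src/model/train.py` would then train only the adapter weights.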
data/external ADDED
File without changes
data/processed ADDED
File without changes
data/raw ADDED
File without changes
demo_app/app.py ADDED
@@ -0,0 +1,18 @@
+ import streamlit as st
+ from transformers import pipeline
+
+ st.title("Fine-tuned LLM Demo")
+
+ # Load a model pipeline (replace 'gpt2' with your actual model repo id if needed)
+ generator = pipeline('text-generation', model='gpt2')
+
+ prompt = st.text_input("Enter prompt:")
+
+ # Add a slider to select the number of responses
+ num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)
+
+ if prompt:
+     # do_sample=True is required when requesting more than one return sequence
+     results = generator(prompt, max_length=100, do_sample=True, num_return_sequences=num_responses)
+     for i, result in enumerate(results):
+         st.write(f"Output {i+1}: {result['generated_text']}")
docs/setup_guide.md ADDED
@@ -0,0 +1,22 @@
+ # Project Setup Guide
+
+ ## Installation
+
+ 1. Create a Python environment:
+    python -m venv env
+    source env/bin/activate   # Linux/macOS
+    .\env\Scripts\activate    # Windows
+
+ 2. Install dependencies:
+    pip install -r requirements.txt
+
+ 3. Run a training example:
+    python src/model/train.py
+
+ ---
+
+ ## Usage
+
+ - Run the demo app:
+    streamlit run demo_app/app.py
+
docs/usage.md ADDED
@@ -0,0 +1,13 @@
+ # Project Usage
+
+ ## Setting Up
+ 1. Install dependencies: `pip install -r requirements.txt`
+ 2. Prepare the dataset in `data/processed/`
+ 3. Run training: `python src/model/train.py --config configs/train_config.yaml`
+ 4. Launch the demo app: `streamlit run demo_app/app.py`
+
+ ## Project Structure
+ - `models/`: Fine-tuned checkpoints
+ - `demo_app/`: Demo frontend interface
+ - `configs/`: Configs and hyperparameters
+ - `tests/`: Unit tests
environment.local.yml ADDED
@@ -0,0 +1,11 @@
+ name: llm-finetune-env
+ channels:
+   - defaults
+   - conda-forge
+ dependencies:
+   - python=3.10
+   - pytorch
+   - transformers
+   - streamlit
+   - gradio
+   - langchain
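This environment can be created with `conda env create -f environment.local.yml` and activated with `conda activate llm-finetune-env`. Note that it omits `datasets`, which `requirements.txt` includes and `src/model/train.py` imports.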
notebooks/exploratory ADDED
File without changes
notebooks/inference ADDED
File without changes
notebooks/modeling ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ torch
+ datasets
+ streamlit
+ gradio
+ langchain
src/cloud_deploy/deploy.py ADDED
@@ -0,0 +1,5 @@
+ def deploy_to_aws():
+     print("Placeholder: Add AWS deployment code here")
+
+ if __name__ == "__main__":
+     deploy_to_aws()
src/cloud_deploy/deploy_aws.py ADDED
@@ -0,0 +1,6 @@
+ def deploy_model_aws():
+     # Placeholder for AWS deployment logic using boto3, the AWS SDK, or AWS CLI commands
+     print("Deploying model to AWS... (implement deployment logic here)")
+
+ if __name__ == "__main__":
+     deploy_model_aws()
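Both deployment scripts are placeholders. One possible starting point, sketched under assumptions (a hypothetical S3 bucket name, credentials resolved through boto3's standard chain), is uploading the fine-tuned checkpoint directory to S3:

```python
import os
import boto3

def upload_model_to_s3(model_dir="models/llm-finetuned",
                       bucket="my-model-bucket",   # hypothetical bucket name
                       prefix="llm-finetuned"):
    """Upload every file under model_dir to s3://<bucket>/<prefix>/."""
    s3 = boto3.client("s3")  # credentials come from env vars or ~/.aws
    for root, _, files in os.walk(model_dir):
        for name in files:
            local_path = os.path.join(root, name)
            key = f"{prefix}/{os.path.relpath(local_path, model_dir)}"
            s3.upload_file(local_path, bucket, key)
            print(f"Uploaded {local_path} -> s3://{bucket}/{key}")

if __name__ == "__main__":
    upload_model_to_s3()
```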
src/eval/evaluate.py ADDED
@@ -0,0 +1,18 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
+
+ def evaluate_model(model_path, test_sentences):
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     model = AutoModelForCausalLM.from_pretrained(model_path)
+     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+     for sentence in test_sentences:
+         output = generator(sentence, max_length=50, num_return_sequences=1)
+         print(f"Input: {sentence}\nOutput: {output[0]['generated_text']}\n")
+
+ if __name__ == "__main__":
+     test_samples = [
+         "How does fine-tuning work?",
+         "Explain parameter-efficient methods like LoRA."
+     ]
+     evaluate_model("models/llm-finetuned", test_samples)
src/model/train.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ import yaml
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     DataCollatorForLanguageModeling,
+     Trainer,
+     TrainingArguments,
+ )
+ from datasets import load_dataset
+
+ def load_config(config_path):
+     # The config file is YAML, so parse it with yaml rather than json
+     with open(config_path, "r") as f:
+         return yaml.safe_load(f)
+
+ def main():
+     config = load_config("configs/train_config.yaml")
+
+     model_name = config["model_name"]
+     dataset_path = config["dataset_path"]
+     output_dir = config["output_dir"]
+     learning_rate = float(config["learning_rate"])  # cast in case YAML parses the value as a string
+     batch_size = config["batch_size"]
+     num_epochs = config["num_epochs"]
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token  # some causal-LM tokenizers ship without a pad token
+     model = AutoModelForCausalLM.from_pretrained(model_name)
+
+     # Example dataset loading; replace with your data loading pipeline
+     dataset = load_dataset("json", data_files={"train": dataset_path})
+     def tokenize_function(examples):
+         return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
+     tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+     # The collator copies input_ids into labels so the Trainer can compute a causal-LM loss
+     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+     training_args = TrainingArguments(
+         output_dir=output_dir,
+         overwrite_output_dir=True,
+         evaluation_strategy="no",  # no eval_dataset is passed to the Trainer
+         learning_rate=learning_rate,
+         per_device_train_batch_size=batch_size,
+         num_train_epochs=num_epochs,
+         save_strategy="epoch",
+         logging_dir="./logs",
+         logging_steps=10,
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_dataset["train"],
+         data_collator=data_collator,
+         tokenizer=tokenizer,
+     )
+
+     trainer.train()
+     trainer.save_model(output_dir)
+
+ if __name__ == "__main__":
+     main()
src/orchestration/langchain_pipeline.py ADDED
@@ -0,0 +1,25 @@
+ from langchain.llms import HuggingFacePipeline
+
+ # Load your fine-tuned Hugging Face model as a pipeline
+ llm = HuggingFacePipeline.from_model_id(
+     model_id="your-model-name-or-path",  # e.g. "yourusername/your-finetuned-model"
+     task="text-generation"
+ )
+
+ # Define functions for each step (customize as needed)
+ def step_1(input_text):
+     # Example step: preprocess input
+     return input_text.strip()
+
+ def step_2(processed_input):
+     # Example step: pass through the LLM
+     return llm(processed_input)
+
+ def step_3(llm_output):
+     # Example step: post-process output
+     return llm_output.upper()  # just an example
+
+ # Plain functions are not LangChain Chain objects and cannot be passed to
+ # SequentialChain, so compose the steps directly
+ def run_chain(input_text):
+     return step_3(step_2(step_1(input_text)))
src/orchestration/workflow.py ADDED
@@ -0,0 +1,13 @@
+ from langchain import PromptTemplate, LLMChain
+ from langchain.llms import HuggingFaceHub
+
+ def run_workflow(prompt):
+     # HuggingFaceHub reads the HUGGINGFACEHUB_API_TOKEN environment variable for authentication
+     llm = HuggingFaceHub(repo_id="facebook/opt-350m", model_kwargs={"temperature": 0.7})
+     template = PromptTemplate(input_variables=["topic"], template="Explain {topic} in simple terms.")
+     chain = LLMChain(prompt=template, llm=llm)  # the keyword is prompt, not prompt_template
+     response = chain.run(topic=prompt)
+     print(response)
+
+ if __name__ == "__main__":
+     run_workflow("parameter-efficient fine-tuning (LoRA)")
src/preprocessing/preprocess_data.py ADDED
@@ -0,0 +1,19 @@
+ from transformers import AutoTokenizer
+
+ def preprocess_data(data, model_name="facebook/opt-350m"):
+     """
+     Tokenizes input text data using the chosen tokenizer.
+     Args:
+         data (list of str): Input sentences.
+         model_name (str): Pretrained model tokenizer.
+     Returns:
+         tokenized dataset (dict): Dictionary of tokenized inputs.
+     """
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     tokenized = tokenizer(data, padding=True, truncation=True, return_tensors="pt")
+     return tokenized
+
+ if __name__ == "__main__":
+     sample = ["Fine-tuning an open-source LLM.", "This is a sample sentence."]
+     tokenized_sample = preprocess_data(sample)
+     print(tokenized_sample)
src/utils/logger.py ADDED
@@ -0,0 +1,16 @@
+ import logging
+ import os
+
+ def setup_logger(name="train_log", log_file="logs/training.log"):
+     os.makedirs(os.path.dirname(log_file), exist_ok=True)
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.INFO)
+     fh = logging.FileHandler(log_file)
+     formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+     fh.setFormatter(formatter)
+     logger.addHandler(fh)
+     return logger
+
+ if __name__ == "__main__":
+     log = setup_logger()
+     log.info("Logger initialized.")
src/utils/utils.py ADDED
@@ -0,0 +1,16 @@
+ import os
+ import logging
+
+ def setup_logging(log_dir="logs", log_file="training.log"):
+     os.makedirs(log_dir, exist_ok=True)
+     log_path = os.path.join(log_dir, log_file)
+     logging.basicConfig(
+         level=logging.INFO,
+         format='%(asctime)s - %(levelname)s - %(message)s',
+         handlers=[logging.FileHandler(log_path), logging.StreamHandler()]
+     )
+     return logging.getLogger()
+
+ if __name__ == "__main__":
+     logger = setup_logging()
+     logger.info("Logger is set up and ready.")
tests/test_basic.py ADDED
@@ -0,0 +1,11 @@
+ import unittest
+ from src.preprocessing.preprocess_data import preprocess_data
+
+ class TestPreprocessing(unittest.TestCase):
+     def test_tokenize(self):
+         sample = ["This is a test."]
+         result = preprocess_data(sample)
+         self.assertIn("input_ids", result)
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_preprocessing.py ADDED
@@ -0,0 +1,13 @@
+ import unittest
+ from src.preprocessing.preprocess_data import preprocess_data
+
+ class TestPreprocessing(unittest.TestCase):
+     def test_sample(self):
+         sample_data = ["Hello World!"]
+         processed = preprocess_data(sample_data)
+         # preprocess_data returns a dict-like BatchEncoding of tensors, not a list of strings
+         self.assertIn("input_ids", processed)
+         self.assertEqual(processed["input_ids"].shape[0], len(sample_data))
+
+ if __name__ == "__main__":
+     unittest.main()
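Both test modules import from `src.`, so the suite should be run from the repository root, e.g. `python -m unittest tests.test_basic tests.test_preprocessing` (assuming the `src` packages are importable from there).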