Upload folder using huggingface_hub
- .code-workspace +8 -0
- .gitignore +7 -0
- .vscode/settings.json +5 -0
- README.md +13 -3
- configs/train_config.yaml +10 -0
- data/external +0 -0
- data/processed +0 -0
- data/raw +0 -0
- demo_app/app.py +17 -0
- docs/setup_guide.md +23 -0
- docs/usage.md +13 -0
- environment.local.yml +11 -0
- notebooks/exploratory +0 -0
- notebooks/inference +0 -0
- notebooks/modeling +0 -0
- requirements.txt +6 -0
- src/cloud_deploy/deploy.py +5 -0
- src/cloud_deploy/deploy_aws.py +6 -0
- src/eval/evaluate.py +18 -0
- src/model/train.py +53 -0
- src/orchestration/langchain_pipeline.py +25 -0
- src/orchestration/workflow.py +12 -0
- src/preprocessing/preprocess_data.py +19 -0
- src/utils/logger.py +16 -0
- src/utils/utils.py +16 -0
- tests/test_basic.py +11 -0
- tests/test_preprocessing.py +12 -0
.code-workspace
ADDED
@@ -0,0 +1,8 @@
+{
+  "folders": [
+    { "path": "C:/Users/hp/Desktop/llm-finetuning-cloud-orchestration" }
+  ],
+  "settings": {
+    "python.pythonPath": "env/bin/python"
+  }
+}
.gitignore
ADDED
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+.env
+env/
+logs/
+models/
+results/
.vscode/settings.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "cSpell.words": [
+    "langchain"
+  ]
+}
README.md
CHANGED
@@ -1,3 +1,13 @@
-
-
-
+# Finetuning an Open-Source LLM
+
+This project adapts large language models to domain-specific tasks, leveraging parameter-efficient techniques (LoRA/QLoRA), cloud deployment, and workflow orchestration.
+
+## Getting Started
+- Clone this repo
+- Install Python dependencies
+- See `demo_app/app.py` to launch the demo
+
+## Structure
+- Models: Fine-tuned checkpoints
+- Demo App: Streamlit/Gradio interface
+- Configs: Training/deployment configs
configs/train_config.yaml
ADDED
@@ -0,0 +1,10 @@
+model_name: "llama-base"
+learning_rate: 5e-5
+batch_size: 16
+num_epochs: 3
+lora:
+  rank: 8
+  alpha: 16
+  dropout: 0.1
+dataset_path: "data/processed/train.json"
+output_dir: "models/llm-finetuned"
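For reference, `src/model/train.py` below reads this file; a minimal sketch of parsing it, assuming PyYAML is installed (it is not listed in requirements.txt):

import yaml

# Parse the training config; the keys mirror configs/train_config.yaml above.
with open("configs/train_config.yaml") as f:
    config = yaml.safe_load(f)

print(config["model_name"], config["lora"]["rank"])  # llama-base 8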
data/external
ADDED
File without changes
data/processed
ADDED
File without changes
data/raw
ADDED
File without changes
demo_app/app.py
ADDED
@@ -0,0 +1,17 @@
+import streamlit as st
+from transformers import pipeline
+
+st.title("Fine-tuned LLM Demo")
+
+# Load a model pipeline (replace 'gpt2' with your actual model repo id if needed)
+generator = pipeline('text-generation', model='gpt2')
+
+prompt = st.text_input("Enter prompt:")
+
+# Add a slider to select number of responses
+num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)
+
+if prompt:
+    results = generator(prompt, max_length=100, do_sample=True, num_return_sequences=num_responses)  # sampling is required for >1 sequence
+    for i, result in enumerate(results):
+        st.write(f"Output {i+1}: {result['generated_text']}")
docs/setup_guide.md
ADDED
@@ -0,0 +1,23 @@
+# Project Setup Guide
+
+## Installation
+
+1. Create Python environment:
+   python -m venv env
+   source env/bin/activate    # Linux/macOS
+   .\env\Scripts\activate     # Windows
+
+2. Install dependencies:
+   pip install -r requirements.txt
+
+3. Run training example:
+   python src/model/train.py
+
+---
+
+## Usage
+
+- Run demo app:
+  streamlit run demo_app/app.py
docs/usage.md
ADDED
@@ -0,0 +1,13 @@
+# Project Usage
+
+## Setting Up
+1. Install dependencies: `pip install -r requirements.txt`
+2. Prepare dataset in `data/processed/`
+3. Run training: `python src/model/train.py --config configs/train_config.yaml`
+4. Launch demo app: `streamlit run demo_app/app.py`
+
+## Project Structure
+- `models/`: Fine-tuned checkpoints
+- `demo_app/`: Demo frontend interface
+- `configs/`: Configs and hyperparameters
+- `tests/`: Unit tests
environment.local.yml
ADDED
@@ -0,0 +1,11 @@
+name: llm-finetune-env
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  - python=3.10
+  - pytorch
+  - transformers
+  - streamlit
+  - gradio
+  - langchain
notebooks/exploratory
ADDED
File without changes
notebooks/inference
ADDED
File without changes
notebooks/modeling
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+transformers
+torch
+datasets
+streamlit
+gradio
+langchain
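Note: `src/model/train.py` parses the YAML training config, so `pyyaml` is an additional dependency not pinned here (installable with `pip install pyyaml`).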
src/cloud_deploy/deploy.py
ADDED
@@ -0,0 +1,5 @@
+def deploy_to_aws():
+    print("Placeholder: Add AWS deployment code here")
+
+if __name__ == "__main__":
+    deploy_to_aws()
src/cloud_deploy/deploy_aws.py
ADDED
@@ -0,0 +1,6 @@
+def deploy_model_aws():
+    # Placeholder for AWS deployment logic using boto3, AWS SDK, or AWS CLI commands
+    print("Deploying model to AWS... (implement deployment logic here)")
+
+if __name__ == "__main__":
+    deploy_model_aws()
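The placeholder above mentions boto3; a minimal sketch of one possible first step, uploading the fine-tuned weights to S3 (the bucket name and object key are hypothetical):

import boto3

def upload_model_to_s3(bucket="my-llm-artifacts", model_dir="models/llm-finetuned"):
    # Upload one weights file; a real deployment would sync the whole directory.
    s3 = boto3.client("s3")
    s3.upload_file(f"{model_dir}/pytorch_model.bin", bucket, "llm-finetuned/pytorch_model.bin")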
src/eval/evaluate.py
ADDED
@@ -0,0 +1,18 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch
+
+def evaluate_model(model_path, test_sentences):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+    for sentence in test_sentences:
+        output = generator(sentence, max_length=50, num_return_sequences=1)
+        print(f"Input: {sentence}\nOutput: {output[0]['generated_text']}\n")
+
+if __name__ == "__main__":
+    test_samples = [
+        "How does fine-tuning work?",
+        "Explain parameter-efficient methods like LoRA."
+    ]
+    evaluate_model("models/llm-finetuned", test_samples)
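The script above only prints generations; a hedged sketch of a quantitative complement, computing perplexity on a held-out sentence (reusing the repo's `models/llm-finetuned` path):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def perplexity(model_path, text):
    # Perplexity is exp(mean token-level cross-entropy) of the model on the text.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        loss = model(**enc, labels=enc["input_ids"]).loss
    return torch.exp(loss).item()

print(perplexity("models/llm-finetuned", "How does fine-tuning work?"))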
src/model/train.py
ADDED
@@ -0,0 +1,53 @@
+import argparse
+from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+from datasets import load_dataset
+import yaml
+
+def load_config(config_path):
+    # The config file is YAML, so parse it with yaml.safe_load (json.load would fail here)
+    with open(config_path, "r") as f:
+        return yaml.safe_load(f)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", default="configs/train_config.yaml")
+    config = load_config(parser.parse_args().config)
+
+    model_name = config["model_name"]
+    dataset_path = config["dataset_path"]
+    output_dir = config["output_dir"]
+    learning_rate = config["learning_rate"]
+    batch_size = config["batch_size"]
+    num_epochs = config["num_epochs"]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token  # causal-LM tokenizers often lack a pad token
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    # Example dataset loading; replace with your data loading pipeline
+    dataset = load_dataset("json", data_files={"train": dataset_path})
+    def tokenize_function(examples):
+        return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)
+    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        overwrite_output_dir=True,
+        learning_rate=learning_rate,
+        per_device_train_batch_size=batch_size,
+        num_train_epochs=num_epochs,
+        save_strategy="epoch",
+        logging_dir='./logs',
+        logging_steps=10,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset["train"],
+        tokenizer=tokenizer,
+        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),  # provides causal-LM labels so the Trainer can compute a loss
+    )
+
+    trainer.train()
+    trainer.save_model(output_dir)
+
+if __name__ == "__main__":
+    main()
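Note that the config's `lora:` block (rank/alpha/dropout) is never applied in train.py. A hedged sketch of wiring it in with the peft library (an assumption; peft is not in requirements.txt), placed after the model is loaded and before the Trainer is built:

from peft import LoraConfig, get_peft_model

# Hypothetical wiring of configs/train_config.yaml's lora block; peft is assumed installed.
lora_cfg = LoraConfig(
    r=config["lora"]["rank"],
    lora_alpha=config["lora"]["alpha"],
    lora_dropout=config["lora"]["dropout"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)  # wrap the base model before passing it to Trainer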
src/orchestration/langchain_pipeline.py
ADDED
@@ -0,0 +1,25 @@
+from langchain.llms import HuggingFacePipeline
+
+# Load your fine-tuned HuggingFace model as a pipeline
+llm = HuggingFacePipeline.from_model_id(
+    model_id="your-model-name-or-path",  # e.g. "yourusername/your-finetuned-model"
+    task="text-generation"
+)
+
+# Define functions for each step (customize as needed)
+def step_1(input_text):
+    # Example step: preprocess input
+    return input_text.strip()
+
+def step_2(processed_input):
+    # Example step: pass through LLM
+    return llm(processed_input)
+
+def step_3(llm_output):
+    # Example step: post-process output
+    return llm_output.upper()  # just an example
+
+# Run the steps in order. LangChain's SequentialChain expects Chain objects,
+# not plain functions, so simple function composition is used here instead.
+def run_pipeline(input_text):
+    return step_3(step_2(step_1(input_text)))
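A hypothetical invocation of the composed steps:

if __name__ == "__main__":
    print(run_pipeline("  Explain LoRA in one sentence.  "))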
src/orchestration/workflow.py
ADDED
@@ -0,0 +1,12 @@
+from langchain import PromptTemplate, LLMChain
+from langchain.llms import HuggingFaceHub
+
+def run_workflow(prompt):
+    llm = HuggingFaceHub(repo_id="facebook/opt-350m", model_kwargs={"temperature": 0.7})  # requires HUGGINGFACEHUB_API_TOKEN to be set
+    template = PromptTemplate(input_variables=["topic"], template="Explain {topic} in simple terms.")
+    chain = LLMChain(prompt=template, llm=llm)  # LLMChain's argument is `prompt`, not `prompt_template`
+    response = chain.run(topic=prompt)
+    print(response)
+
+if __name__ == "__main__":
+    run_workflow("parameter-efficient fine-tuning (LoRA)")
src/preprocessing/preprocess_data.py
ADDED
@@ -0,0 +1,19 @@
+from transformers import AutoTokenizer
+
+def preprocess_data(data, model_name="facebook/opt-350m"):
+    """
+    Tokenizes input text data using the chosen tokenizer.
+    Args:
+        data (list of str): Input sentences.
+        model_name (str): Pretrained model tokenizer.
+    Returns:
+        tokenized dataset (dict): Dictionary of tokenized inputs.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenized = tokenizer(data, padding=True, truncation=True, return_tensors="pt")
+    return tokenized
+
+if __name__ == "__main__":
+    sample = ["Fine-tuning an open-source LLM.", "This is a sample sentence."]
+    tokenized_sample = preprocess_data(sample)
+    print(tokenized_sample)
src/utils/logger.py
ADDED
@@ -0,0 +1,16 @@
+import logging
+import os
+
+def setup_logger(name="train_log", log_file="logs/training.log"):
+    os.makedirs(os.path.dirname(log_file), exist_ok=True)
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    if not logger.handlers:  # avoid adding duplicate handlers on repeated calls
+        fh = logging.FileHandler(log_file)
+        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        fh.setFormatter(formatter)
+        logger.addHandler(fh)
+    return logger
+
+if __name__ == "__main__":
+    log = setup_logger()
+    log.info("Logger initialized.")
src/utils/utils.py
ADDED
@@ -0,0 +1,16 @@
+import os
+import logging
+
+def setup_logging(log_dir="logs", log_file="training.log"):
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, log_file)
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        handlers=[logging.FileHandler(log_path), logging.StreamHandler()]
+    )
+    return logging.getLogger()
+
+if __name__ == "__main__":
+    logger = setup_logging()
+    logger.info("Logger is set up and ready.")
tests/test_basic.py
ADDED
@@ -0,0 +1,11 @@
+import unittest
+from src.preprocessing.preprocess_data import preprocess_data
+
+class TestPreprocessing(unittest.TestCase):
+    def test_tokenize(self):
+        sample = ["This is a test."]
+        result = preprocess_data(sample)
+        self.assertIn("input_ids", result)
+
+if __name__ == "__main__":
+    unittest.main()
tests/test_preprocessing.py
ADDED
@@ -0,0 +1,12 @@
+import unittest
+from src.preprocessing.preprocess_data import preprocess_data
+
+class TestPreprocessing(unittest.TestCase):
+    def test_sample(self):
+        sample_data = ["Hello World!"]
+        processed = preprocess_data(sample_data)
+        # preprocess_data returns tokenized tensors keyed by feature name, not raw strings
+        self.assertIn("input_ids", processed)
+
+if __name__ == "__main__":
+    unittest.main()