Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Upload 2 files
Browse files
GPT-2 with GA added
- app.py +242 -105
- requirements.txt +10 -4
    	
        app.py
    CHANGED
    
    | @@ -1,105 +1,242 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
            import  | 
| 3 | 
            -
            import  | 
| 4 | 
            -
            import  | 
| 5 | 
            -
             | 
| 6 | 
            -
            from  | 
| 7 | 
            -
            from  | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
                     | 
| 21 | 
            -
                     | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
                     | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
                     | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
                 | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
                     | 
| 39 | 
            -
                     | 
| 40 | 
            -
                     | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
                 | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
                 | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
                 | 
| 87 | 
            -
                 | 
| 88 | 
            -
                     | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
                     | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
                 | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import streamlit as st
         | 
| 2 | 
            +
            import numpy as np
         | 
| 3 | 
            +
            import random
         | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import transformers
         | 
| 6 | 
            +
            from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
         | 
| 7 | 
            +
            from datasets import Dataset
         | 
| 8 | 
            +
            import os
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Seed every random number generator the app draws from (stdlib `random`,
            # NumPy and PyTorch) with one shared value so that demo-data generation
            # and genetic-algorithm sampling are reproducible from run to run.
            # NOTE(review): Streamlit presumably re-executes this module on every
            # widget interaction, which would re-apply the seed each time - confirm.
            SEED = 42
            random.seed(SEED)
            np.random.seed(SEED)
            torch.manual_seed(SEED)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            def generate_demo_data(num_samples=60):
                """Build a small synthetic corpus of "<subject> <verb> <object>." sentences.

                Every sentence is assembled from three draws of the module-level
                ``random`` generator (subject first, then verb, then object), so the
                output is reproducible whenever that generator has been seeded.

                Args:
                    num_samples: Number of sentences to generate. Zero or a negative
                        value yields an empty list.

                Returns:
                    A list of ``num_samples`` sentence strings, each ending in a period.
                """
                # Tuples rather than lists: the vocabularies are fixed, read-only data.
                subjects = (
                    'Artificial intelligence', 'Climate change', 'Renewable energy',
                    'Space exploration', 'Quantum computing', 'Genetic engineering',
                    'Blockchain technology', 'Virtual reality', 'Cybersecurity',
                    'Biotechnology', 'Nanotechnology', 'Astrophysics',
                )
                verbs = (
                    'is transforming', 'is influencing', 'is revolutionizing',
                    'is challenging', 'is advancing', 'is reshaping', 'is impacting',
                    'is enhancing', 'is disrupting', 'is redefining',
                )
                objects = (
                    'modern science', 'global economies', 'healthcare systems',
                    'communication methods', 'educational approaches',
                    'environmental policies', 'social interactions', 'the job market',
                    'data security', 'the entertainment industry',
                )
                # f-string fields are evaluated left to right, so each sentence consumes
                # the generator in subject -> verb -> object order.
                return [
                    f"{random.choice(subjects)} {random.choice(verbs)} {random.choice(objects)}."
                    for _ in range(num_samples)
                ]
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            def load_data(uploaded_file):
                """Read an uploaded UTF-8 text file and return its non-blank lines.

                Args:
                    uploaded_file: A binary file-like object whose ``read()`` returns
                        ``bytes`` (for example the object returned by
                        ``st.file_uploader`` or an ``io.BytesIO``).

                Returns:
                    A list with one string per non-blank line, in file order and
                    without line terminators. Whitespace inside a kept line is left
                    untouched.

                Raises:
                    UnicodeDecodeError: If the payload is not valid UTF-8.
                """
                # "utf-8-sig" decodes plain UTF-8 identically but also strips a leading
                # byte-order mark, which would otherwise end up glued to the first line.
                text = uploaded_file.read().decode("utf-8-sig")
                # Blank lines carry no training signal, so they are dropped instead of
                # being handed on as empty samples.
                return [line for line in text.splitlines() if line.strip()]
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            def prepare_dataset(data, tokenizer, block_size=128):
                """Tokenize raw text lines into a fixed-length causal-LM training dataset.

                Each text is truncated or padded to exactly ``block_size`` tokens. The
                ``labels`` column mirrors ``input_ids`` except at padding positions
                (attention mask 0), which are set to -100 so that a loss which ignores
                that index does not train on padding.

                Args:
                    data: Iterable of text samples, one string per sample.
                    tokenizer: A tokenizer with a pad token configured; it is called as
                        ``tokenizer(texts, truncation=..., max_length=..., padding=...)``.
                    block_size: Sequence length every sample is padded/truncated to.

                Returns:
                    A ``datasets.Dataset`` in PyTorch format exposing the columns
                    ``input_ids``, ``attention_mask`` and ``labels``.

                Raises:
                    ValueError: If ``data`` contains no samples.
                """
                texts = list(data)
                if not texts:
                    # Fail early with a clear message instead of an obscure error from
                    # the dataset machinery further down.
                    raise ValueError("prepare_dataset() needs at least one text sample")

                def tokenize_function(examples):
                    encoded = dict(tokenizer(
                        examples['text'],
                        truncation=True,
                        max_length=block_size,
                        padding='max_length',
                    ))
                    # Build the labels in the same pass as tokenization; padding
                    # positions get -100 so they can be excluded from the loss.
                    encoded['labels'] = [
                        [token if keep else -100 for token, keep in zip(ids, mask)]
                        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
                    ]
                    return encoded

                raw_dataset = Dataset.from_dict({'text': texts})
                tokenized_dataset = raw_dataset.map(
                    tokenize_function, batched=True, remove_columns=['text']
                )

                # Expose the model inputs as torch tensors.
                tokenized_dataset.set_format(
                    type='torch', columns=['input_ids', 'attention_mask', 'labels']
                )
                return tokenized_dataset
         | 
| 67 | 
            +
             | 
| 68 | 
            +
            def fitness_function(individual, train_dataset, model, tokenizer):
                """Fine-tune ``model`` with one hyperparameter set and return its training loss.

                Lower is better: the returned value is used directly as the fitness that
                the genetic algorithm minimises.

                Args:
                    individual: Mapping with the keys ``'epochs'``, ``'batch_size'`` and
                        ``'learning_rate'``.
                    train_dataset: Tokenized dataset passed to the ``Trainer``.
                    model: The model instance to train; it is updated in place.
                    tokenizer: Tokenizer handed to the language-modeling data collator.

                Returns:
                    The last step-level ``loss`` the trainer logged. When the run was too
                    short to reach a logging step, the average training loss reported by
                    ``trainer.train()`` is used instead. A missing or non-finite loss is
                    returned as ``float('inf')`` so such individuals always rank last.
                """
                import math  # local import keeps this block self-contained

                training_args = TrainingArguments(
                    output_dir='./results',
                    overwrite_output_dir=True,
                    num_train_epochs=individual['epochs'],
                    per_device_train_batch_size=individual['batch_size'],
                    learning_rate=individual['learning_rate'],
                    logging_steps=10,
                    # No checkpoint is ever read back in this file, so do not write
                    # full model checkpoints to disk for every candidate.
                    save_strategy='no',
                    report_to='none',  # Disable logging to Wandb or other services
                )

                data_collator = DataCollatorForLanguageModeling(
                    tokenizer=tokenizer, mlm=False
                )

                trainer = Trainer(
                    model=model,
                    args=training_args,
                    data_collator=data_collator,
                    train_dataset=train_dataset,
                    eval_dataset=None,
                )

                train_output = trainer.train()

                # Step-level losses only exist once training reaches `logging_steps`
                # optimizer steps; small datasets with few epochs never get there.
                step_losses = [
                    entry['loss'] for entry in trainer.state.log_history if 'loss' in entry
                ]
                if step_losses:
                    loss = step_losses[-1]
                else:
                    # Fall back to the run-average loss rather than scoring every
                    # short run as infinitely bad.
                    loss = getattr(train_output, 'training_loss', None)

                if loss is None or not math.isfinite(loss):
                    return float('inf')
                return loss
         | 
| 104 | 
            +
             | 
| 105 | 
            +
            # Genetic Algorithm Functions
         | 
| 106 | 
            +
            def create_population(size, param_bounds):
                """Sample ``size`` random hyperparameter sets inside ``param_bounds``.

                Args:
                    size: Number of individuals to create.
                    param_bounds: Mapping with ``'learning_rate'`` as a (low, high) pair
                        for ``random.uniform``, ``'epochs'`` as a (low, high) pair for
                        ``random.randint`` (both ends inclusive) and ``'batch_size'`` as
                        a sequence of allowed values.

                Returns:
                    A list of ``size`` dicts with the keys ``'learning_rate'``,
                    ``'epochs'`` and ``'batch_size'``.
                """
                def _sample_individual():
                    # Draw order: learning rate, then epochs, then batch size.
                    return {
                        'learning_rate': random.uniform(*param_bounds['learning_rate']),
                        'epochs': random.randint(*param_bounds['epochs']),
                        'batch_size': random.choice(param_bounds['batch_size']),
                    }

                return [_sample_individual() for _ in range(size)]
         | 
| 116 | 
            +
             | 
| 117 | 
            +
            def select_mating_pool(population, fitnesses, num_parents):
                """Return the ``num_parents`` individuals with the lowest fitness values.

                Args:
                    population: Sequence of individuals.
                    fitnesses: Sequence of numeric fitness scores, aligned index by index
                        with ``population``; lower is better.
                    num_parents: How many of the best individuals to keep.

                Returns:
                    A list of the selected individuals, best first. The entries are the
                    same objects as in ``population`` (not copies). Individuals with
                    equal fitness keep their original relative order.

                Raises:
                    ValueError: If ``population`` and ``fitnesses`` differ in length.
                """
                if len(population) != len(fitnesses):
                    raise ValueError(
                        f"population has {len(population)} individuals but "
                        f"{len(fitnesses)} fitness values were given"
                    )
                # A stable sort makes tie-breaking deterministic: among equal scores the
                # individual that appears first in the population wins.
                ranking = np.argsort(fitnesses, kind='stable')
                return [population[i] for i in ranking[:num_parents]]
         | 
| 120 | 
            +
             | 
| 121 | 
            +
            def crossover(parents, offspring_size):
                """Create children by picking each gene at random from one of two parents.

                For every child two parents are drawn with replacement (so both may be
                the same individual); each gene is then taken from either parent with
                equal probability.

                Args:
                    parents: Non-empty sequence of dicts that share the same keys.
                    offspring_size: Number of children to create; zero or a negative
                        value yields an empty list.

                Returns:
                    A list of ``offspring_size`` newly created dicts with the same keys
                    (in the same order) as the parents.

                Raises:
                    IndexError: If ``parents`` is empty and a child is requested.
                    KeyError: If the second parent lacks a key the first parent has.
                """
                offspring = []
                for _ in range(offspring_size):
                    parent1 = random.choice(parents)
                    parent2 = random.choice(parents)
                    # Iterate over the genes the parents actually carry instead of a
                    # hard-coded list, so new hyperparameters need no change here.
                    offspring.append({
                        gene: random.choice((parent1[gene], parent2[gene]))
                        for gene in parent1
                    })
                return offspring
         | 
| 133 | 
            +
             | 
| 134 | 
            +
            def mutation(offspring, param_bounds, mutation_rate=0.1):
                """Randomly re-sample genes of each child, modifying the children in place.

                Each of the three genes is considered independently: with probability
                ``mutation_rate`` it is replaced by a fresh draw from ``param_bounds``.

                Args:
                    offspring: List of dicts with the keys ``'learning_rate'``,
                        ``'epochs'`` and ``'batch_size'``; mutated in place.
                    param_bounds: Mapping giving a (low, high) pair for
                        ``'learning_rate'`` and ``'epochs'`` and a sequence of allowed
                        values for ``'batch_size'``.
                    mutation_rate: Per-gene probability of re-sampling.

                Returns:
                    The same ``offspring`` list that was passed in.
                """
                # Gene name -> how to draw a replacement value from its bounds entry.
                # The tuple order fixes the order in which genes are considered.
                resamplers = (
                    ('learning_rate', lambda bounds: random.uniform(*bounds)),
                    ('epochs', lambda bounds: random.randint(*bounds)),
                    ('batch_size', lambda choices: random.choice(choices)),
                )
                for child in offspring:
                    for gene, resample in resamplers:
                        if random.random() < mutation_rate:
                            child[gene] = resample(param_bounds[gene])
                return offspring
         | 
| 143 | 
            +
             | 
| 144 | 
            +
            # Streamlit App
         | 
| 145 | 
            +
            def main():
                """Render the Streamlit UI and run the genetic-algorithm search for GPT-2.

                Flow: pick a data source (built-in demo sentences or an uploaded text
                file), tokenize it, read the GA settings from the sidebar and, when
                "Start Training" is pressed, fine-tune a fresh GPT-2 copy for every
                candidate hyperparameter set over several generations.

                The outcome of the most recent search (best hyperparameters, best loss,
                per-generation best loss and the best model itself) is kept in
                ``st.session_state`` so that the results and the "Save Model" button
                remain available on later reruns of the script, e.g. the rerun caused by
                pressing "Save Model".
                """
                st.title("GPT-2 Fine-Tuning with Genetic Algorithm")

                option = st.sidebar.selectbox(
                    'Choose Data Source',
                    ('DEMO', 'Upload Text File')
                )

                if option == 'DEMO':
                    st.write("Using DEMO data...")
                    data = generate_demo_data()
                else:
                    st.write("Upload a text file for fine-tuning.")
                    uploaded_file = st.file_uploader("Choose a text file", type="txt")
                    if uploaded_file is None:
                        st.warning("Please upload a text file.")
                        st.stop()
                    data = load_data(uploaded_file)

                # Blank lines would only produce all-padding samples; an empty corpus
                # cannot be trained on at all.
                data = [line for line in data if line.strip()]
                if not data:
                    st.warning("The selected data source contains no text to train on.")
                    st.stop()

                # Only the tokenizer is needed up front: every candidate is trained on
                # its own freshly loaded model inside the search loop below.
                st.write("Loading GPT-2 tokenizer...")
                tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

                # Set the pad token
                tokenizer.pad_token = tokenizer.eos_token

                # Prepare dataset
                st.write("Preparing dataset...")
                train_dataset = prepare_dataset(data, tokenizer)

                # GA Parameters
                st.sidebar.subheader("Genetic Algorithm Parameters")
                population_size = st.sidebar.number_input("Population Size", 4, 20, 6)
                num_generations = st.sidebar.number_input("Number of Generations", 1, 10, 3)
                # Cap parents below the population size so every generation has room
                # for at least one new child; otherwise the population never changes.
                num_parents = st.sidebar.number_input(
                    "Number of Parents", 2, population_size - 1, 2
                )
                mutation_rate = st.sidebar.slider("Mutation Rate", 0.0, 1.0, 0.1)

                # Hyperparameter bounds
                param_bounds = {
                    'learning_rate': (1e-5, 5e-5),
                    'epochs': (1, 3),
                    'batch_size': [2, 4, 8]
                }

                device = 'cuda' if torch.cuda.is_available() else 'cpu'

                if st.button("Start Training"):
                    st.write("Initializing Genetic Algorithm...")
                    population = create_population(population_size, param_bounds)
                    best_individual = None
                    best_fitness = float('inf')
                    best_model = None
                    fitness_history = []

                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    total_evaluations = num_generations * len(population)
                    current_evaluation = 0

                    for generation in range(num_generations):
                        st.write(f"Generation {generation+1}/{num_generations}")
                        fitnesses = []
                        for idx, individual in enumerate(population):
                            status_text.text(f"Evaluating individual {idx+1}/{len(population)} in generation {generation+1}")
                            # Each candidate starts from the same pretrained weights.
                            model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
                            model_clone.config.pad_token_id = model_clone.config.eos_token_id
                            model_clone.to(device)
                            fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
                            fitnesses.append(fitness)
                            if fitness < best_fitness:
                                best_fitness = fitness
                                # Snapshot the hyperparameters and keep the matching
                                # trained model; park it on the CPU so it does not
                                # occupy accelerator memory during later evaluations.
                                best_individual = dict(individual)
                                best_model = model_clone.to('cpu')
                            current_evaluation += 1
                            progress_bar.progress(current_evaluation / total_evaluations)
                        fitness_history.append(min(fitnesses))
                        parents = select_mating_pool(population, fitnesses, num_parents)
                        offspring_size = population_size - num_parents
                        offspring = crossover(parents, offspring_size)
                        offspring = mutation(offspring, param_bounds, mutation_rate)
                        population = parents + offspring

                    # Persist the outcome across reruns. NOTE: this keeps one full
                    # model in memory per browser session.
                    st.session_state['ga_result'] = {
                        'best_individual': best_individual,
                        'best_fitness': best_fitness,
                        'fitness_history': fitness_history,
                        'best_model': best_model,
                    }

                # Rendered outside the "Start Training" branch on purpose: a button is
                # only True during the rerun triggered by its own click, so a button
                # nested inside another button's branch can never be acted upon.
                if 'ga_result' in st.session_state:
                    result = st.session_state['ga_result']
                    st.write("Training completed!")
                    st.write(f"Best Hyperparameters: {result['best_individual']}")
                    st.write(f"Best Fitness (Loss): {result['best_fitness']}")

                    # Plot fitness history
                    st.line_chart(result['fitness_history'])

                    # Save the best model
                    if st.button("Save Model"):
                        if result['best_model'] is None:
                            st.warning("No candidate produced a finite loss, so there is no model to save.")
                        else:
                            result['best_model'].save_pretrained('./fine_tuned_model')
                            tokenizer.save_pretrained('./fine_tuned_model')
                            st.write("Model saved successfully!")


            if __name__ == "__main__":
                main()
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -1,4 +1,10 @@ | |
| 1 | 
            -
            streamlit
         | 
| 2 | 
            -
            numpy
         | 
| 3 | 
            -
            tensorflow
         | 
| 4 | 
            -
            scikit-learn
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            streamlit
         | 
| 2 | 
            +
            numpy
         | 
| 3 | 
            +
            tensorflow
         | 
| 4 | 
            +
            scikit-learn
         | 
| 5 | 
            +
            transformers
         | 
| 6 | 
            +
            torch
         | 
| 7 | 
            +
            accelerate
         | 
| 8 | 
            +
            datasets
         | 
| 9 | 
            +
            tf-keras
         | 
| 10 | 
            +
             | 

