Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						6646b26
	
0
								Parent(s):
							
							
Duplicate from somosnlp/somos-alpaca-es
Browse files
Co-authored-by: Daniel Vila <[email protected]>
- .gitattributes +34 -0
 - Dockerfile +7 -0
 - README.md +13 -0
 - load_data.py +99 -0
 
    	
        .gitattributes
    ADDED
    
    | 
         @@ -0,0 +1,34 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            *.7z filter=lfs diff=lfs merge=lfs -text
         
     | 
| 2 | 
         
            +
            *.arrow filter=lfs diff=lfs merge=lfs -text
         
     | 
| 3 | 
         
            +
            *.bin filter=lfs diff=lfs merge=lfs -text
         
     | 
| 4 | 
         
            +
            *.bz2 filter=lfs diff=lfs merge=lfs -text
         
     | 
| 5 | 
         
            +
            *.ckpt filter=lfs diff=lfs merge=lfs -text
         
     | 
| 6 | 
         
            +
            *.ftz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 7 | 
         
            +
            *.gz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 8 | 
         
            +
            *.h5 filter=lfs diff=lfs merge=lfs -text
         
     | 
| 9 | 
         
            +
            *.joblib filter=lfs diff=lfs merge=lfs -text
         
     | 
| 10 | 
         
            +
            *.lfs.* filter=lfs diff=lfs merge=lfs -text
         
     | 
| 11 | 
         
            +
            *.mlmodel filter=lfs diff=lfs merge=lfs -text
         
     | 
| 12 | 
         
            +
            *.model filter=lfs diff=lfs merge=lfs -text
         
     | 
| 13 | 
         
            +
            *.msgpack filter=lfs diff=lfs merge=lfs -text
         
     | 
| 14 | 
         
            +
            *.npy filter=lfs diff=lfs merge=lfs -text
         
     | 
| 15 | 
         
            +
            *.npz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 16 | 
         
            +
            *.onnx filter=lfs diff=lfs merge=lfs -text
         
     | 
| 17 | 
         
            +
            *.ot filter=lfs diff=lfs merge=lfs -text
         
     | 
| 18 | 
         
            +
            *.parquet filter=lfs diff=lfs merge=lfs -text
         
     | 
| 19 | 
         
            +
            *.pb filter=lfs diff=lfs merge=lfs -text
         
     | 
| 20 | 
         
            +
            *.pickle filter=lfs diff=lfs merge=lfs -text
         
     | 
| 21 | 
         
            +
            *.pkl filter=lfs diff=lfs merge=lfs -text
         
     | 
| 22 | 
         
            +
            *.pt filter=lfs diff=lfs merge=lfs -text
         
     | 
| 23 | 
         
            +
            *.pth filter=lfs diff=lfs merge=lfs -text
         
     | 
| 24 | 
         
            +
            *.rar filter=lfs diff=lfs merge=lfs -text
         
     | 
| 25 | 
         
            +
            *.safetensors filter=lfs diff=lfs merge=lfs -text
         
     | 
| 26 | 
         
            +
            saved_model/**/* filter=lfs diff=lfs merge=lfs -text
         
     | 
| 27 | 
         
            +
            *.tar.* filter=lfs diff=lfs merge=lfs -text
         
     | 
| 28 | 
         
            +
            *.tflite filter=lfs diff=lfs merge=lfs -text
         
     | 
| 29 | 
         
            +
            *.tgz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 30 | 
         
            +
            *.wasm filter=lfs diff=lfs merge=lfs -text
         
     | 
| 31 | 
         
            +
            *.xz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 32 | 
         
            +
            *.zip filter=lfs diff=lfs merge=lfs -text
         
     | 
| 33 | 
         
            +
            *.zst filter=lfs diff=lfs merge=lfs -text
         
     | 
| 34 | 
         
            +
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         
     | 
    	
        Dockerfile
    ADDED
    
    | 
         @@ -0,0 +1,7 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            FROM argilla/argilla-quickstart:latest
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            COPY load_data.py /
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            RUN pip install argilla[listeners]
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            CMD whoami && /start_quickstart_argilla.sh
         
     | 
    	
        README.md
    ADDED
    
    | 
         @@ -0,0 +1,13 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            ---
         
     | 
| 2 | 
         
            +
            title: Hackathon SomosNLP Reto Datasets LLM Español
         
     | 
| 3 | 
         
            +
            emoji: 🦙 🏷️
         
     | 
| 4 | 
         
            +
            colorFrom: purple
         
     | 
| 5 | 
         
            +
            colorTo: red
         
     | 
| 6 | 
         
            +
            sdk: docker
         
     | 
| 7 | 
         
            +
            app_port: 6900
         
     | 
| 8 | 
         
            +
            fullWidth: true
         
     | 
| 9 | 
         
            +
            tags:
         
     | 
| 10 | 
         
            +
            - argilla
         
     | 
| 11 | 
         
            +
            - somosnlp
         
     | 
| 12 | 
         
            +
            duplicated_from: somosnlp/somos-alpaca-es
         
     | 
| 13 | 
         
            +
            ---
         
     | 
    	
        load_data.py
    ADDED
    
    | 
         @@ -0,0 +1,99 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import sys
         
     | 
| 2 | 
         
            +
            import time
         
     | 
| 3 | 
         
            +
            import os
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            import argilla as rg
         
     | 
| 6 | 
         
            +
            import pandas as pd
         
     | 
| 7 | 
         
            +
            import requests
         
     | 
| 8 | 
         
            +
            from datasets import load_dataset, concatenate_datasets
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            from argilla.listeners import listener
         
     | 
| 11 | 
         
            +
             
     | 
| 12 | 
         
            +
            HF_TOKEN = os.environ.get("HF_TOKEN")
         
     | 
| 13 | 
         
            +
            HUB_DATASET_NAME = os.environ.get('HUB_DATASET_NAME')
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            @listener(
         
     | 
| 16 | 
         
            +
                dataset="somos-alpaca-es", 
         
     | 
| 17 | 
         
            +
                query="status:Validated", # https://docs.argilla.io/en/latest/guides/features/queries.html
         
     | 
| 18 | 
         
            +
                execution_interval_in_seconds=1200, # interval to check the execution of `save_validated_to_hub`
         
     | 
| 19 | 
         
            +
            )
         
     | 
| 20 | 
         
            +
            def save_validated_to_hub(records, ctx):
         
     | 
| 21 | 
         
            +
                if len(records) > 0:
         
     | 
| 22 | 
         
            +
                    ds = rg.DatasetForTextClassification(records=records).to_datasets()   
         
     | 
| 23 | 
         
            +
                    if HF_TOKEN:
         
     | 
| 24 | 
         
            +
                        print("Pushing the dataset")
         
     | 
| 25 | 
         
            +
                        print(ds)
         
     | 
| 26 | 
         
            +
                        ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN)
         
     | 
| 27 | 
         
            +
                    else:
         
     | 
| 28 | 
         
            +
                        print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
         
     | 
| 29 | 
         
            +
                else:
         
     | 
| 30 | 
         
            +
                    print("NO RECORDS found")
         
     | 
| 31 | 
         
            +
             
     | 
| 32 | 
         
            +
            class LoadDatasets:
         
     | 
| 33 | 
         
            +
                def __init__(self, api_key, workspace="team"):
         
     | 
| 34 | 
         
            +
                    rg.init(api_key=api_key, workspace=workspace)
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
                @staticmethod
         
     | 
| 37 | 
         
            +
                def load_somos():
         
     | 
| 38 | 
         
            +
                    # Leer el dataset del Hub
         
     | 
| 39 | 
         
            +
                    try:
         
     | 
| 40 | 
         
            +
                        print(f"Trying to sync with {HUB_DATASET_NAME}")
         
     | 
| 41 | 
         
            +
                        old_ds = load_dataset(HUB_DATASET_NAME, split="train")
         
     | 
| 42 | 
         
            +
                    except Exception as e:
         
     | 
| 43 | 
         
            +
                        print(f"Not possible to sync with {HUB_DATASET_NAME}")
         
     | 
| 44 | 
         
            +
                        print(e)
         
     | 
| 45 | 
         
            +
                        old_ds = None
         
     | 
| 46 | 
         
            +
                        
         
     | 
| 47 | 
         
            +
                    dataset = load_dataset("somosnlp/somos-clean-alpaca-es", split="train")
         
     | 
| 48 | 
         
            +
                
         
     | 
| 49 | 
         
            +
                    
         
     | 
| 50 | 
         
            +
                    if old_ds:
         
     | 
| 51 | 
         
            +
                        print("Concatenating datasets")
         
     | 
| 52 | 
         
            +
                        dataset = concatenate_datasets([dataset, old_ds])
         
     | 
| 53 | 
         
            +
                        print("Concatenated dataset is:")
         
     | 
| 54 | 
         
            +
                        print(dataset)
         
     | 
| 55 | 
         
            +
                        
         
     | 
| 56 | 
         
            +
                    dataset = dataset.remove_columns("metrics")
         
     | 
| 57 | 
         
            +
                    records = rg.DatasetForTextClassification.from_datasets(dataset)
         
     | 
| 58 | 
         
            +
             
     | 
| 59 | 
         
            +
                    settings = rg.TextClassificationSettings(
         
     | 
| 60 | 
         
            +
                        label_schema=["BAD INSTRUCTION", "BAD INPUT", "BAD OUTPUT", "INAPPROPRIATE", "BIASED", "ALL GOOD"]
         
     | 
| 61 | 
         
            +
                    )
         
     | 
| 62 | 
         
            +
                    rg.configure_dataset(name="somos-alpaca-es", settings=settings, workspace="team")
         
     | 
| 63 | 
         
            +
                    
         
     | 
| 64 | 
         
            +
                    # Log the dataset
         
     | 
| 65 | 
         
            +
                    rg.log(
         
     | 
| 66 | 
         
            +
                        records,
         
     | 
| 67 | 
         
            +
                        name="somos-alpaca-es",
         
     | 
| 68 | 
         
            +
                        tags={"description": "SomosNLP Hackathon dataset"},
         
     | 
| 69 | 
         
            +
                        batch_size=200
         
     | 
| 70 | 
         
            +
                    )
         
     | 
| 71 | 
         
            +
                    
         
     | 
| 72 | 
         
            +
                    # run listener
         
     | 
| 73 | 
         
            +
                    save_validated_to_hub.start()
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            if __name__ == "__main__":
         
     | 
| 76 | 
         
            +
                API_KEY = sys.argv[1]
         
     | 
| 77 | 
         
            +
                LOAD_DATASETS = sys.argv[2]
         
     | 
| 78 | 
         
            +
             
     | 
| 79 | 
         
            +
                if LOAD_DATASETS.lower() == "none":
         
     | 
| 80 | 
         
            +
                    print("No datasets being loaded")
         
     | 
| 81 | 
         
            +
                else:
         
     | 
| 82 | 
         
            +
                    while True:
         
     | 
| 83 | 
         
            +
                        try:
         
     | 
| 84 | 
         
            +
                            response = requests.get("http://0.0.0.0:6900/")
         
     | 
| 85 | 
         
            +
                            if response.status_code == 200:
         
     | 
| 86 | 
         
            +
                                ld = LoadDatasets(API_KEY)
         
     | 
| 87 | 
         
            +
                                ld.load_somos()
         
     | 
| 88 | 
         
            +
                                break
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
                        except requests.exceptions.ConnectionError:
         
     | 
| 91 | 
         
            +
                            pass
         
     | 
| 92 | 
         
            +
                        except Exception as e:
         
     | 
| 93 | 
         
            +
                            print(e)
         
     | 
| 94 | 
         
            +
                            time.sleep(10)
         
     | 
| 95 | 
         
            +
                            pass
         
     | 
| 96 | 
         
            +
             
     | 
| 97 | 
         
            +
                        time.sleep(5)
         
     | 
| 98 | 
         
            +
                while True:
         
     | 
| 99 | 
         
            +
                    time.sleep(60)
         
     |