File size: 1,211 Bytes
c44a220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import argparse
import pandas as pd
import streamlit as st
from generate_schema import generate_schema
from fetch_data import fetch_real_data
from synthetic_generator import train_and_generate_synthetic

def main() -> None:
    """Run the synthetic-data pipeline from the command line.

    Command-line arguments:
        --prompt: Required description of the dataset to generate.
        --domain: Domain passed to ``fetch_real_data``; defaults to
            ``"healthcare"``.

    The function reads ``hf_token`` from Streamlit secrets, asks
    ``generate_schema`` for a schema, fetches real data for the domain,
    restricts it to ``schema["columns"]``, and passes the result to
    ``train_and_generate_synthetic`` together with the output CSV path
    ``outputs/synthetic_<domain>.csv``.

    Raises:
        SystemExit: If argument parsing fails (raised by argparse).
        KeyError: If ``hf_token`` is absent from the secrets, or if the
            fetched data lacks a column listed in the schema.
    """
    # Function-scope import: only needed here to prepare the output directory.
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, required=True, help="Describe the dataset you want")
    parser.add_argument("--domain", type=str, default="healthcare", help="Domain to fetch real data from (optional)")
    args = parser.parse_args()

    # Retrieve API token from Streamlit secrets
    hf_token = st.secrets["hf_token"]

    # Step 1: Generate schema using LLM
    schema = generate_schema(args.prompt, hf_token)
    print(f"📊 Generated schema: {schema}")

    # Step 2: Fetch real data for the requested domain
    real_data = fetch_real_data(args.domain)

    # Step 3: Keep only the columns named by the schema. A missing column
    # still surfaces as KeyError, but now says which domain/schema clashed.
    try:
        real_data = real_data[schema["columns"]]
    except KeyError as err:
        raise KeyError(
            f"real data for domain {args.domain!r} lacks columns required "
            f"by the generated schema: {err}"
        ) from err
    print(f"✅ Fetched real data with shape: {real_data.shape}")

    # Step 4: Train GAN and generate synthetic data
    output_path = f"outputs/synthetic_{args.domain}.csv"
    # Nothing visible here creates the output directory, so make sure it
    # exists before the generator is asked to write into it.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    train_and_generate_synthetic(real_data, schema, output_path)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()