File size: 1,211 Bytes
c44a220 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
import argparse
import pandas as pd
import streamlit as st
from generate_schema import generate_schema
from fetch_data import fetch_real_data
from synthetic_generator import train_and_generate_synthetic
def main(argv=None):
    """Run the synthetic-data pipeline from the command line.

    Parses the CLI arguments, asks ``generate_schema`` for a schema based on
    the prompt, fetches real data for the requested domain, narrows it to the
    schema's columns, and passes the result to
    ``train_and_generate_synthetic``.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so running
            the script directly behaves as before.
    """
    # Local import keeps this block self-contained; only needed for makedirs.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Describe the dataset you want",
    )
    parser.add_argument(
        "--domain",
        type=str,
        default="healthcare",
        help="Domain to fetch real data from (optional)",
    )
    args = parser.parse_args(argv)

    # Retrieve API token from Streamlit secrets
    hf_token = st.secrets["hf_token"]

    # Step 1: Generate schema using LLM
    schema = generate_schema(args.prompt, hf_token)
    # Plain-ASCII status text: the glyph that used to prefix this message was
    # corrupted by an encoding round-trip.
    print(f"Generated schema: {schema}")

    # Step 2: Fetch real data (optional)
    real_data = fetch_real_data(args.domain)

    # Step 3: Preprocess (if necessary)
    # Keep only the columns named by the schema.
    # NOTE(review): assumes real_data is a pandas DataFrame and that
    # schema['columns'] is a list of its column labels -- confirm against
    # fetch_real_data / generate_schema.
    real_data = real_data[schema['columns']]
    print(f"Fetched real data with shape: {real_data.shape}")

    # Step 4: Train GAN and generate synthetic data
    # Make sure the target directory exists first; presumably the generator
    # writes a CSV to output_path and would fail on a fresh checkout otherwise.
    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/synthetic_{args.domain}.csv"
    train_and_generate_synthetic(real_data, schema, output_path)
# Entry-point guard: run the pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|