import os

# Point all Hugging Face cache locations at a writable directory. This must
# happen before transformers is imported, or the cache settings are read too late.
cache = "/app/hf_cache"
os.makedirs(cache, exist_ok=True)
os.environ["HF_HOME"] = cache
os.environ["TRANSFORMERS_CACHE"] = cache
os.environ["XDG_CACHE_HOME"] = cache

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Initialize FastAPI
app = FastAPI()
# Load GRU model and tokenizer
gru_model = tf.keras.models.load_model('hs_gru.h5')
with open('tokenizerpkl_gru.pkl', 'rb') as f:
    gru_tokenizer = pickle.load(f)
gru_maxlen = 100
# Load RoBERTa hate speech model
roberta_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
if roberta_tokenizer.pad_token is None:
    roberta_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_name)
roberta_model.resize_token_embeddings(len(roberta_tokenizer))
roberta_model.eval()  # inference mode: disables dropout
# Load ToxiGen RoBERTa model
toxigen_model_name = "tomh/toxigen_roberta"
toxigen_tokenizer = AutoTokenizer.from_pretrained(toxigen_model_name)
if toxigen_tokenizer.pad_token is None:
    toxigen_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
toxigen_model = AutoModelForSequenceClassification.from_pretrained(toxigen_model_name)
toxigen_model.resize_token_embeddings(len(toxigen_tokenizer))
toxigen_model.eval()  # inference mode: disables dropout
# Enable CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Mount static directory
# app.mount("/static", StaticFiles(directory="static"), name="static")

# Pydantic input model
class TextInput(BaseModel):
    text: str
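# Example request body matching TextInput:
#   {"text": "some input to classify"}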
# Serve the frontend page
@app.get("/", response_class=HTMLResponse)
def read_root():
    with open("index.html", "r") as f:
        return f.read()

# Health check endpoint (route path assumed)
@app.get("/health")
def health_check():
    return {"message": "Hate Speech Detection API is running!"}
# Ensemble prediction endpoint (route path assumed)
@app.post("/predict")
def predict_ensemble(input: TextInput):
    try:
        text = input.text
        # print(f"Received input: {input.text}")

        # ----- GRU Prediction -----
        seq = gru_tokenizer.texts_to_sequences([text])
        padded = pad_sequences(seq, maxlen=gru_maxlen, padding='post')
        gru_prob = float(gru_model.predict(padded, verbose=0)[0][0])

        # ----- RoBERTa Prediction -----
        inputs_roberta = roberta_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits_roberta = roberta_model(**inputs_roberta).logits
        probs_roberta = torch.nn.functional.softmax(logits_roberta, dim=1)
        roberta_prob = float(probs_roberta[0][1].item())

        # ----- ToxiGen RoBERTa Prediction -----
        inputs_toxigen = toxigen_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits_toxigen = toxigen_model(**inputs_toxigen).logits
        probs_toxigen = torch.nn.functional.softmax(logits_toxigen, dim=1)
        toxigen_prob = float(probs_toxigen[0][1].item())

        # ----- Weighted Ensemble -----
        final_score = (0.3 * gru_prob) + (0.4 * roberta_prob) + (0.3 * toxigen_prob)
        label = "Hate Speech" if final_score > 0.5 else "Not Hate Speech"

        return {
            # "text": text,
            "gru_prob": round(gru_prob, 4),
            "roberta_prob": round(roberta_prob, 4),
            "toxigen_prob": round(toxigen_prob, 4),
            "final_score": round(final_score, 4),
            "prediction": label
        }
    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        return JSONResponse(status_code=500, content={"detail": str(e)})
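
# Run locally with uvicorn (a minimal sketch; assumes this file is app.py and
# port 7860, the Hugging Face Spaces default -- adjust to your setup):
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# Example request against the predict endpoint:
#
#   curl -X POST http://localhost:7860/predict \
#        -H "Content-Type: application/json" \
#        -d '{"text": "example input text"}'
if __name__ == "__main__":
    import uvicorn  # assumed available alongside FastAPI
    uvicorn.run(app, host="0.0.0.0", port=7860)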