Spaces:
Running
Running
File size: 4,061 Bytes
b18f4a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import logging
import os
import pickle

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Application object that the route decorators further down attach to.
app = FastAPI()

# Keep Hugging Face downloads in a directory this process creates itself.
# NOTE(review): `transformers` is already imported above this point, so these
# variables may be set too late to influence its cache location -- confirm.
cache = "/app/hf_cache"
os.makedirs(cache, exist_ok=True)
os.environ.update(
    {
        "HF_HOME": cache,
        "TRANSFORMERS_CACHE": cache,
        "XDG_CACHE_HOME": cache,
    }
)
from transformers import AutoTokenizer
# GRU classifier: the Keras model plus the pickled tokenizer used to encode its input.
# pickle.load can execute arbitrary code, so the .pkl file must be a trusted artifact.
gru_maxlen = 100  # sequence length handed to pad_sequences at inference time
gru_model = tf.keras.models.load_model('hs_gru.h5')
with open('tokenizerpkl_gru.pkl', 'rb') as tokenizer_file:
    gru_tokenizer = pickle.load(tokenizer_file)
def _load_sequence_classifier(model_name):
    """Load a Hugging Face tokenizer and its sequence-classification model.

    Args:
        model_name: Hub repository id of the checkpoint to load.

    Returns:
        A ``(tokenizer, model)`` tuple. If the tokenizer has no pad token,
        ``[PAD]`` is registered as one, and the model's embedding matrix is
        resized to the tokenizer's vocabulary size.
    """
    # Pass the cache directory explicitly: the HF_HOME / TRANSFORMERS_CACHE
    # variables are assigned only after `transformers` has been imported, so
    # they cannot be relied on to redirect the download location.
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, cache_dir=cache
    )
    # Keep the embedding matrix in step with the tokenizer in case a pad token
    # was added above.
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model


# Load the RoBERTa hate-speech model
roberta_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
roberta_tokenizer, roberta_model = _load_sequence_classifier(roberta_model_name)

# Load the ToxiGen RoBERTa model
toxigen_model_name = "tomh/toxigen_roberta"
toxigen_tokenizer, toxigen_model = _load_sequence_classifier(toxigen_model_name)
# Enable CORS for any origin.
# The wildcard origin is kept so browser clients on other hosts can call the
# API, but credentials are switched off: a wildcard origin combined with
# allow_credentials=True lets any site make credentialed requests, and the
# endpoints visible in this file use no cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Mount static directory
# app.mount("/static", StaticFiles(directory="static"), name="static")
# Request body schema for POST /predict.
class TextInput(BaseModel):
    # Raw text to classify; the handler passes it to every model in the ensemble.
    text: str
@app.get("/", response_class=HTMLResponse)
def read_root():
    """Serve the front-end page.

    Returns:
        The contents of ``index.html`` (resolved against the current working
        directory) as a string, delivered as an HTML response.
    """
    # Decode explicitly as UTF-8 so the page does not depend on the locale's
    # default encoding (assumes index.html is saved as UTF-8).
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.get("/health")
def health_check():
    """Liveness probe: reply with a fixed status message."""
    status = {"message": "Hate Speech Detection API is running!"}
    return status
def _positive_class_prob(tokenizer, model, text, max_length=512):
    """Return the softmax probability of class index 1 for ``text``.

    Args:
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Sequence-classification model exposing ``.logits`` on its output.
        text: Input string to score.
        max_length: Token limit used for truncation. 512 is assumed to fit both
            RoBERTa checkpoints loaded by this file -- confirm against their
            configs if the models change.

    Returns:
        The probability of class index 1 as a Python float.
    """
    # An explicit max_length makes truncation effective even when the tokenizer
    # carries no predefined maximum length of its own.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    with torch.no_grad():
        logits = model(**encoded).logits
    probs = torch.nn.functional.softmax(logits, dim=1)
    return float(probs[0][1].item())


@app.post("/predict")
def predict_ensemble(input: TextInput):
    """Score a text with the GRU, RoBERTa and ToxiGen models and combine them.

    Args:
        input: Request body carrying the text to classify. The parameter name
            is kept as-is for interface compatibility.

    Returns:
        A dict with each model's probability (``gru_prob``, ``roberta_prob``,
        ``toxigen_prob``), the weighted ``final_score`` (all rounded to four
        decimals) and a ``prediction`` label. If anything raises, a
        ``JSONResponse`` with status 500 and a generic ``detail`` message is
        returned instead.
    """
    try:
        text = input.text

        # ----- GRU Prediction -----
        seq = gru_tokenizer.texts_to_sequences([text])
        padded = pad_sequences(seq, maxlen=gru_maxlen, padding='post')
        # verbose=0 stops Keras from printing a progress bar on every request.
        gru_prob = float(gru_model.predict(padded, verbose=0)[0][0])

        # ----- Transformer Predictions -----
        roberta_prob = _positive_class_prob(roberta_tokenizer, roberta_model, text)
        toxigen_prob = _positive_class_prob(toxigen_tokenizer, toxigen_model, text)

        # ----- Weighted Ensemble (weights sum to 1.0) -----
        final_score = (0.3 * gru_prob) + (0.4 * roberta_prob) + (0.3 * toxigen_prob)
        label = "Hate Speech" if final_score > 0.5 else "Not Hate Speech"

        return {
            "gru_prob": round(gru_prob, 4),
            "roberta_prob": round(roberta_prob, 4),
            "toxigen_prob": round(toxigen_prob, 4),
            "final_score": round(final_score, 4),
            "prediction": label,
        }
    except Exception:
        # Top-level boundary for the endpoint: log the full traceback on the
        # server, but do not send internal exception text back to the client.
        logging.getLogger(__name__).exception("Error during prediction")
        return JSONResponse(
            status_code=500,
            content={"detail": "Internal error during prediction."},
        )
|