import warnings
warnings.filterwarnings('ignore')
import os
import gradio as gr
import torch
import tempfile
import numpy as np
import cohere
import spacy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from pdfminer.high_level import extract_text
from nltk.tokenize.texttiling import TextTilingTokenizer
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# Download spaCy model
spacy.cli.download("en_core_web_sm")
co = cohere.Client(os.environ.get("CO_API_KEY"))
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModel
# Load models
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")
# Initialize TextTilingTokenizer with default parameters
tiling_tokenizer = TextTilingTokenizer()
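
# Note: TextTiling segments text at topic shifts. It expects paragraphs separated by
# blank lines, so extracted PDF text without "\n\n" breaks may need light preprocessing.

# Summarize the chunk embeddings into a single scalar, prepend that summary to the
# prompt, and query the Cohere generation endpoint.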
def generate_response(prompt, embeddings):
    # Reduce each chunk embedding to its mean, then average those means into one scalar.
    aggregated_embedding = np.mean([np.mean(embed) for embed in embeddings])
    embedding_str = f"Embedding summary: {aggregated_embedding:.2f}"
    full_prompt = f"{embedding_str}\n\n{prompt}"
    try:
        response = co.generate(
            model="command-xlarge-nightly",
            prompt=full_prompt,
            max_tokens=750,  # allow a longer response
        )
        return response.generations[0].text.strip()
    except cohere.error.CohereError as e:
        return f"An error occurred: {str(e)}"
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)
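
# Encode each text chunk with InLegalBERT, using the [CLS] position of the final
# hidden layer as a single vector per chunk.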
def get_bert_embeddings(texts):
    embeddings_list = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings_list.append(embeddings)
    return embeddings_list
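
# Run spaCy/NLTK analysis: named entities, POS tags, and dependency triples.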
def analyze_text(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return entities, pos_tags, dependencies
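
# End-to-end pipeline: save the upload, extract and analyze the text, segment it,
# embed the segments, and ask the LLM to answer the query over the document.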
def process_pdf_and_generate_response(pdf_file, query):
    # Copy the uploaded file to a named temp file so pdfminer gets a stable path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        with open(pdf_file, 'rb') as f:
            temp_file.write(f.read())
        temp_file_path = temp_file.name
    document_text = extract_text_from_pdf(temp_file_path)
    entities, pos_tags, dependencies = analyze_text(document_text)
    print("Entities:", entities)
    print("POS Tags:", pos_tags)
    print("Dependencies:", dependencies)
    # Segment the document text into topical chunks using TextTiling
    text_chunks = tiling_tokenizer.tokenize(document_text)
    # Embed each chunk with InLegalBERT
    document_embeddings = get_bert_embeddings(text_chunks)
    # Construct the prompt for the LLM
    prompt = (
        "You are an AI-driven research engine for commercial courts. "
        f"Given the legal document: '{document_text[:2000]}', answer the query: '{query}'"
    )
    # Generate the response using the LLM
    response = generate_response(prompt, document_embeddings)
    return response
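
# Helper: split an over-long sentence into chunks of at most ~512 characters.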
def chunk_long_sentence(sentence):
    words = sentence.split()
    chunks = []
    current_chunk = []
    for word in words:
        # 512 characters is a rough proxy for the tokenizer's 512-token limit.
        if len(' '.join(current_chunk + [word])) <= 512:
            current_chunk.append(word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
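
# A minimal sketch of how these pieces could be exposed through Gradio. The component
# types and labels below are assumptions, not taken from the original file;
# gr.File(type="filepath") hands process_pdf_and_generate_response a path string,
# which is what its open() call expects.
demo = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=[
        gr.File(label="Legal document (PDF)", type="filepath"),
        gr.Textbox(label="Query"),
    ],
    outputs=gr.Textbox(label="Response"),
    title="Legal Document Q&A",
)

if __name__ == "__main__":
    demo.launch()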