import warnings
warnings.filterwarnings('ignore')
import os
import gradio as gr
import torch
import tempfile
import numpy as np
import cohere
import spacy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from pdfminer.high_level import extract_text
from nltk.tokenize.texttiling import TextTilingTokenizer
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# Download spaCy model
spacy.cli.download("en_core_web_sm")
co = cohere.Client(os.environ.get("CO_API_KEY"))
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModel
# Load models
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")
# Initialize TextTilingTokenizer with default parameters
tiling_tokenizer = TextTilingTokenizer()
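
# Note: TextTiling segments text at topic shifts. It expects paragraphs separated by
# blank lines, so extracted PDF text without "\n\n" breaks may need light preprocessing.

# Summarize the chunk embeddings into a single scalar, prepend that summary to the
# prompt, and query the Cohere generation endpoint.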
def generate_response(prompt, embeddings):
    # Reduce each chunk embedding to its mean, then average those means into one scalar.
    aggregated_embedding = np.mean([np.mean(embed) for embed in embeddings])
    embedding_str = f"Embedding summary: {aggregated_embedding:.2f}"
    full_prompt = f"{embedding_str}\n\n{prompt}"
    try:
        response = co.generate(
            model="command-xlarge-nightly",
            prompt=full_prompt,
            max_tokens=750,  # allow a longer response
        )
        return response.generations[0].text.strip()
    except cohere.error.CohereError as e:
        return f"An error occurred: {str(e)}"
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)
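
# Encode each text chunk with InLegalBERT, using the [CLS] position of the final
# hidden layer as a single vector per chunk.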
def get_bert_embeddings(texts):
    embeddings_list = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings_list.append(embeddings)
    return embeddings_list
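
# Run spaCy/NLTK analysis: named entities, POS tags, and dependency triples.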
def analyze_text(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return entities, pos_tags, dependencies
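
# End-to-end pipeline: save the upload, extract and analyze the text, segment it,
# embed the segments, and ask the LLM to answer the query over the document.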
def process_pdf_and_generate_response(pdf_file, query):
    # Copy the uploaded file to a named temp file so pdfminer gets a stable path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        with open(pdf_file, 'rb') as f:
            temp_file.write(f.read())
        temp_file_path = temp_file.name
    document_text = extract_text_from_pdf(temp_file_path)
    entities, pos_tags, dependencies = analyze_text(document_text)
    print("Entities:", entities)
    print("POS Tags:", pos_tags)
    print("Dependencies:", dependencies)
    # Segment the document text into topical chunks using TextTiling
    text_chunks = tiling_tokenizer.tokenize(document_text)
    # Embed each chunk with InLegalBERT
    document_embeddings = get_bert_embeddings(text_chunks)
    # Construct the prompt for the LLM
    prompt = (
        "You are an AI-driven research engine for commercial courts. "
        f"Given the legal document: '{document_text[:2000]}', answer the query: '{query}'"
    )
    # Generate the response using the LLM
    response = generate_response(prompt, document_embeddings)
    return response
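
# Helper: split an over-long sentence into chunks of at most ~512 characters.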
def chunk_long_sentence(sentence):
    words = sentence.split()
    chunks = []
    current_chunk = []
    for word in words:
        # 512 characters is a rough proxy for the tokenizer's 512-token limit.
        if len(' '.join(current_chunk + [word])) <= 512:
            current_chunk.append(word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
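
# A minimal sketch of how these pieces could be exposed through Gradio. The component
# types and labels below are assumptions, not taken from the original file;
# gr.File(type="filepath") hands process_pdf_and_generate_response a path string,
# which is what its open() call expects.
demo = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=[
        gr.File(label="Legal document (PDF)", type="filepath"),
        gr.Textbox(label="Query"),
    ],
    outputs=gr.Textbox(label="Response"),
    title="Legal Document Q&A",
)

if __name__ == "__main__":
    demo.launch()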