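"""nlp.py

Pick a random topic, scrape text from a few predefined sites (with dummy
fallback text for the Math topic), train a tiny next-word LSTM on it, and
print generated text, appending a random arithmetic problem when the
selected topic is Math.
"""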
import operator
import random

import numpy as np
import requests
from bs4 import BeautifulSoup
# tensorflow.keras is assumed here; standalone Keras 3 no longer ships
# keras.preprocessing, which the original imports relied on
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# List of predefined topics, their queries, and corresponding URLs
topics = {
    "Technology": {
        "query": "latest technology news",
        "urls": [
            "https://geeksforgeeks.org",
            "https://theverge.com",
        ],
    },
    "Science": {
        "query": "latest science discoveries",
        "urls": [
            "https://oercommons.org/hubs/NSDL",
        ],
    },
    "History": {
        "query": "historical events",
        "urls": [
            "https://history.com",
        ],
    },
    "Math": {
        "query": "",
        "urls": [],
    },
}
# Randomly select a topic
selected_topic = random.choice(list(topics.keys()))
print(f"Selected topic: {selected_topic}")
# Fetch data from predefined URLs with queries
def fetch_data(url, query):
    """Request a site's search results page and return its visible text."""
    # Assumes the site exposes a /search endpoint that accepts a ?q=
    # parameter; letting requests build the query string also URL-encodes it
    search_url = f"{url}/search"
    response = requests.get(search_url, params={"q": query}, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
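# Example: fetch_data("https://theverge.com", "latest technology news")
# requests https://theverge.com/search?q=latest+technology+news, assuming
# the site exposes such an endpoint.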
# Function to perform basic arithmetic operations
def solve_math_problem():
    """Generate a random arithmetic problem and compute its solution."""
    operations = {
        '+': operator.add,
        '-': operator.sub,
        '*': operator.mul,
        '/': operator.truediv,
    }
    # randint(1, 100) never returns 0, so division is always safe
    num1 = random.randint(1, 100)
    num2 = random.randint(1, 100)
    symbol = random.choice(list(operations))
    problem = f"{num1} {symbol} {num2}"
    # Look up the operator function instead of calling eval() on the string
    solution = operations[symbol](num1, num2)
    return problem, solution
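# Example: solve_math_problem() might return ("12 + 7", 19) or ("9 / 4", 2.25)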
# Load data or generate math problem
if selected_topic != "Math":
    data = ""
    for url in topics[selected_topic]["urls"]:
        try:
            data += fetch_data(url, topics[selected_topic]["query"])
        except requests.RequestException:
            # Skip sites that are unreachable or reject the search request
            continue
    if not data.strip():
        # Fall back to dummy text so tokenization still has input
        data = "This is a sample text for the selected topic."
else:
    # Create a dummy data string for tokenization and sequence generation
    data = "This is a sample text for math topic."
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
sequence_data = tokenizer.texts_to_sequences([data])[0]
# Creating sequences
X = []
Y = []
for i in range(len(sequence_data) - 1):
    X.append(sequence_data[i:i+1])
    Y.append(sequence_data[i+1])
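# Each pair is a (current word -> next word) bigram: for the token sequence
# [4, 2, 9] this yields X = [[4], [2]] and Y = [2, 9]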
# Padding sequences
X = pad_sequences(X, maxlen=1)
Y = np.array(Y)
# Defining a lighter model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 10, input_length=1))
model.add(LSTM(10)) # Reduce LSTM size
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))
# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Training the model with fewer epochs
model.fit(X, Y, epochs=10, verbose=1) # Reduced epochs
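# With one-token inputs, the model is effectively learning bigram transition
# probabilities between consecutive words in the scraped text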
# Function to generate text based on input
def generate_text(model, tokenizer, max_sequence_len, input_text, num_words):
    """Greedily extend input_text one predicted word at a time."""
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # predict_classes() was removed from Keras; take the argmax of the
        # predicted probability distribution instead
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
        # index_word is the Tokenizer's built-in reverse lookup
        output_word = tokenizer.index_word.get(predicted, "")
        input_text += " " + output_word
    return input_text
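# Greedy decoding always picks the single most probable next word, so the
# output tends to repeat; sampling from the distribution would add variety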
# Get initial input text and number of words to generate
initial_input_text = "This is a generated text"
num_words = 100 # Number of words to generate
# Generate text
generated_text = generate_text(model, tokenizer, 2, initial_input_text, num_words)
# Append the math problem and solution to the generated text if the topic is math
if selected_topic == "Math":
    math_problem, math_solution = solve_math_problem()
    final_text = f"{generated_text}\n\nMath Problem: {math_problem}\nSolution: {math_solution}"
else:
    final_text = generated_text
print(final_text)