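"""nlp.py

Pick a random topic, scrape text from a few predefined sites (with dummy
fallback text for the Math topic), train a tiny next-word LSTM on it, and
print generated text, appending a random arithmetic problem when the
selected topic is Math.
"""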
import operator
import random

import numpy as np
import requests
from bs4 import BeautifulSoup
# tensorflow.keras is assumed here; standalone Keras 3 no longer ships
# keras.preprocessing, which the original imports relied on
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# List of predefined topics, their queries, and corresponding URLs
topics = {
    "Technology": {
        "query": "latest technology news",
        "urls": [
            "https://geeksforgeeks.org",
            "https://theverge.com",
        ],
    },
    "Science": {
        "query": "latest science discoveries",
        "urls": [
            "https://oercommons.org/hubs/NSDL",
        ],
    },
    "History": {
        "query": "historical events",
        "urls": [
            "https://history.com",
        ],
    },
    "Math": {
        "query": "",
        "urls": [],
    },
}
# Randomly select a topic
selected_topic = random.choice(list(topics.keys()))
print(f"Selected topic: {selected_topic}")
# Fetch data from predefined URLs with queries
def fetch_data(url, query):
    """Request a site's search results page and return its visible text."""
    # Assumes the site exposes a /search endpoint that accepts a ?q=
    # parameter; letting requests build the query string also URL-encodes it
    search_url = f"{url}/search"
    response = requests.get(search_url, params={"q": query}, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
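# Example: fetch_data("https://theverge.com", "latest technology news")
# requests https://theverge.com/search?q=latest+technology+news, assuming
# the site exposes such an endpoint.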
# Function to perform basic arithmetic operations
def solve_math_problem():
    """Generate a random arithmetic problem and compute its solution."""
    operations = {
        '+': operator.add,
        '-': operator.sub,
        '*': operator.mul,
        '/': operator.truediv,
    }
    # randint(1, 100) never returns 0, so division is always safe
    num1 = random.randint(1, 100)
    num2 = random.randint(1, 100)
    symbol = random.choice(list(operations))
    problem = f"{num1} {symbol} {num2}"
    # Look up the operator function instead of calling eval() on the string
    solution = operations[symbol](num1, num2)
    return problem, solution
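# Example: solve_math_problem() might return ("12 + 7", 19) or ("9 / 4", 2.25)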
# Load data or generate math problem
if selected_topic != "Math":
    data = ""
    for url in topics[selected_topic]["urls"]:
        try:
            data += fetch_data(url, topics[selected_topic]["query"])
        except requests.RequestException:
            # Skip sites that are unreachable or reject the search request
            continue
    if not data.strip():
        # Fall back to dummy text so tokenization still has input
        data = "This is a sample text for the selected topic."
else:
    # Create a dummy data string for tokenization and sequence generation
    data = "This is a sample text for math topic."
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
sequence_data = tokenizer.texts_to_sequences([data])[0]
# Creating sequences
X = []
Y = []
for i in range(len(sequence_data) - 1):
    X.append(sequence_data[i:i+1])
    Y.append(sequence_data[i+1])
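# Each pair is a (current word -> next word) bigram: for the token sequence
# [4, 2, 9] this yields X = [[4], [2]] and Y = [2, 9]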
# Padding sequences
X = pad_sequences(X, maxlen=1)
Y = np.array(Y)
# Defining a lighter model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 10, input_length=1))
model.add(LSTM(10)) # Reduce LSTM size
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))
# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Training the model with fewer epochs
model.fit(X, Y, epochs=10, verbose=1) # Reduced epochs
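# With one-token inputs, the model is effectively learning bigram transition
# probabilities between consecutive words in the scraped text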
# Function to generate text based on input
def generate_text(model, tokenizer, max_sequence_len, input_text, num_words):
    """Greedily extend input_text one predicted word at a time."""
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # predict_classes() was removed from Keras; take the argmax of the
        # predicted probability distribution instead
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
        # index_word is the Tokenizer's built-in reverse lookup
        output_word = tokenizer.index_word.get(predicted, "")
        input_text += " " + output_word
    return input_text
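# Greedy decoding always picks the single most probable next word, so the
# output tends to repeat; sampling from the distribution would add variety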
# Get initial input text and number of words to generate
initial_input_text = "This is a generated text"
num_words = 100 # Number of words to generate
# Generate text
generated_text = generate_text(model, tokenizer, 2, initial_input_text, num_words)
# Append the math problem and solution to the generated text if the topic is math
if selected_topic == "Math":
    math_problem, math_solution = solve_math_problem()
    final_text = f"{generated_text}\n\nMath Problem: {math_problem}\nSolution: {math_solution}"
else:
    final_text = generated_text
print(final_text)