Spaces:

ssbars
/

ysdaml4

Running

ysdaml4 / app.py

12faaae about 2 months ago

5.87 kB

	import streamlit as st

	# Page-level configuration; this has to be the first st command in the script
	st.set_page_config(layout="wide", page_icon="📚", page_title="Paper Classification Service")

	import PyPDF2
	import io
	from model import PaperClassifier

	# Initialize the classifier with model selection
	@st.cache_resource
	def load_classifier(model_type):
	    """Construct the PaperClassifier for the given model type.

	    The function is decorated with ``st.cache_resource``, so Streamlit keeps
	    the returned instance and hands it back on later calls with the same
	    ``model_type`` instead of building it again.

	    Args:
	        model_type: Value forwarded unchanged to ``PaperClassifier``.

	    Returns:
	        The ``PaperClassifier`` instance.
	    """
	    classifier_instance = PaperClassifier(model_type)
	    return classifier_instance

	# Cache the PDF text extraction
	@st.cache_data
	def extract_pdf_text(pdf_bytes):
	    """Extract text from a PDF and split it into a title and an abstract.

	    The text of every page is concatenated (one trailing newline per page).
	    The first non-blank line is treated as the title and everything after it
	    as the abstract. This is a heuristic: nothing here inspects the PDF's
	    layout or metadata.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.

	    Returns:
	        A ``(title, abstract)`` tuple of stripped strings. Both are empty
	        when the document yields no non-blank text; ``abstract`` is empty
	        when nothing follows the title line.

	    Raises:
	        Whatever ``PyPDF2.PdfReader`` or ``extract_text`` raise for a
	        document they cannot parse; no exception is handled here.
	    """
	    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))

	    # Guard against a page yielding no text object at all (e.g. image-only
	    # pages in some PyPDF2 versions) so the concatenation cannot fail.
	    text = "".join(
	        (page.extract_text() or "") + "\n" for page in pdf_reader.pages
	    )
	    lines = text.split('\n')

	    # Extracted text often starts with blank or whitespace-only lines; taking
	    # lines[0] blindly would report "no title" for an otherwise readable PDF.
	    title_index = next(
	        (index for index, line in enumerate(lines) if line.strip()),
	        None,
	    )
	    if title_index is None:
	        return "", ""

	    title = lines[title_index]
	    abstract = "\n".join(lines[title_index + 1:])

	    return title.strip(), abstract.strip()

	# Identifiers of every model the classifier exposes
	available_models = list(PaperClassifier.AVAILABLE_MODELS)

	# Sidebar: let the user choose which model to run
	st.sidebar.title("Model Settings")
	selected_model = st.sidebar.selectbox(
	    label="Select Model",
	    options=available_models,
	    index=0,
	    help="Choose the model to use for classification",
	)

	# Sidebar: show the name and description registered for the chosen model
	model_info = PaperClassifier.AVAILABLE_MODELS[selected_model]
	st.sidebar.markdown(f"""
	### Selected Model
	Name: {model_info['name']}
	Description: {model_info['description']}
	""")

	# Obtain the classifier for the chosen model via the cached loader
	classifier = load_classifier(selected_model)

	# Page heading followed by a short explanation of the two input options
	st.title("📚 Academic Paper Classification")
	st.markdown("""
	This service helps you classify academic papers into different categories.
	You can either:
	- Enter the paper's title and abstract separately
	- Upload a PDF file
	""")

	# Two side-by-side columns, one per input method
	col1, col2 = st.columns(spec=2)

	def _render_classification(result):
	    """Render one classification result in the current Streamlit container.

	    Shared by the manual-input and PDF-upload flows so both present results
	    identically.

	    Args:
	        result: Mapping returned by ``classifier.classify_paper``. The keys
	            read here are ``input_type``, ``model_used`` and
	            ``top_categories``; each entry of ``top_categories`` is read for
	            ``category``, ``arxiv_category`` and ``probability``.
	    """
	    st.success("Classification Complete!")
	    st.write(f"Input Type: {result['input_type'].replace('_', ' ').title()}")
	    st.write(f"Model Used: {result['model_used']}")

	    # Show top categories
	    st.subheader("Top Categories (95% Confidence)")
	    total_prob = 0
	    for cat_info in result['top_categories']:
	        prob = cat_info['probability']
	        total_prob += prob
	        # st.progress raises for floats outside [0.0, 1.0]; clamp only the
	        # bar value so rounding overshoot cannot break the page, while the
	        # label and the running total keep the unmodified probability.
	        bar_value = min(max(float(prob), 0.0), 1.0)
	        st.progress(
	            bar_value,
	            text=f"{cat_info['category']} ({cat_info['arxiv_category']}): {prob:.1%}",
	        )

	    st.info(f"Total probability of shown categories: {total_prob:.1%}")


	with col1:
	    st.subheader("Option 1: Manual Input")

	    # Title input
	    title_input = st.text_input(
	        "Paper Title:",
	        placeholder="Enter the paper title...",
	    )

	    # Abstract input
	    abstract_input = st.text_area(
	        "Paper Abstract (optional):",
	        height=200,
	        placeholder="Enter the paper abstract (optional)...",
	    )

	    if st.button("Classify Paper"):
	        if title_input.strip():
	            with st.spinner("Classifying..."):
	                result = classifier.classify_paper(
	                    title=title_input,
	                    # A blank abstract is passed as None rather than ""
	                    abstract=abstract_input if abstract_input.strip() else None,
	                )

	            _render_classification(result)
	        else:
	            st.warning("Please enter at least the paper title.")

	with col2:
	    st.subheader("Option 2: PDF Upload")
	    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

	    # The button is only offered once a file has been uploaded
	    if uploaded_file is not None and st.button("Classify PDF"):
	        try:
	            with st.spinner("Processing PDF..."):
	                # getvalue() returns the complete upload regardless of the
	                # stream position, unlike read(), which returns nothing once
	                # the buffer has already been consumed.
	                title, abstract = extract_pdf_text(uploaded_file.getvalue())

	                if not title:
	                    # Report the problem without st.stop(): stopping the
	                    # script here would also drop the sidebar model list and
	                    # the footer that are rendered further down the page.
	                    st.error("Could not extract title from PDF.")
	                else:
	                    # Show extracted text
	                    with st.expander("Show extracted text"):
	                        st.write("Extracted Title:")
	                        st.write(title)
	                        if abstract:
	                            st.write("Extracted Abstract:")
	                            st.write(abstract)

	                    # Classify the paper
	                    result = classifier.classify_paper(
	                        title=title,
	                        abstract=abstract if abstract else None,
	                    )

	                    _render_classification(result)
	        except Exception as e:
	            # UI boundary: surface any parsing/classification failure to the
	            # user instead of crashing the app.
	            st.error(f"Error processing PDF: {str(e)}")

	# Sidebar: static overview of the models, separated by a horizontal rule
	with st.sidebar:
	    st.markdown("---")
	    st.title("Available Models")
	    st.markdown("""
	- DistilBERT: Fast and lightweight
	- DeBERTa v3: Advanced performance
	- T5: Versatile text-to-text
	- RoBERTa: Strong performance
	- SciBERT: Specialized for science
	""")

	# Page footer below a horizontal rule
	st.markdown("---")
	st.markdown("Made with ❤️ using Streamlit and Transformers")