Spaces:

Medon90ae
/

chalet_jornal

Sleeping

App Files Files Community

chalet_jornal / model_training.py

Medon90ae

Update model_training.py

18193a9 verified 3 months ago

raw

history blame contribute delete

12.4 kB

	import pandas as pd
	import numpy as np
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
	from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
	from datasets import Dataset
	import torch
	from huggingface_hub import notebook_login
	import os

	def login_to_huggingface():
	    """Authenticate with the Hugging Face Hub.

	    When the ``HF_TOKEN`` environment variable is set, the token is used for
	    a non-interactive login, which is what scripts and hosted apps need.
	    Otherwise the interactive notebook widget is shown, as before.

	    Returns:
	        True if the login call completed without raising, False otherwise.
	    """
	    try:
	        token = os.environ.get("HF_TOKEN")
	        if token:
	            # notebook_login() only renders a widget inside a notebook, so a
	            # token-based login is required everywhere else.
	            from huggingface_hub import login

	            login(token=token)
	        else:
	            notebook_login()
	        return True
	    except Exception as e:
	        print(f"خطأ في تسجيل الدخول: {str(e)}")
	        return False

	def prepare_sentiment_data(data_path):
	    """Load a sentiment CSV and convert it into a Hugging Face ``Dataset``.

	    The CSV must contain a ``text`` column and a ``label`` column whose
	    values are ``positive``, ``neutral`` or ``negative`` (case and
	    surrounding whitespace are ignored). Labels are encoded as
	    0 (positive), 1 (neutral) and 2 (negative).

	    Rows with a missing text or an unrecognised label are dropped, and the
	    number of dropped rows is reported, instead of leaking NaN labels into
	    training.

	    Args:
	        data_path: Path to the CSV file.

	    Returns:
	        A ``Dataset`` with integer labels, or None if the file cannot be
	        read, required columns are missing, or no valid rows remain.
	    """
	    try:
	        df = pd.read_csv(data_path)

	        # Make sure the required columns are present.
	        required_columns = ['text', 'label']
	        if not all(col in df.columns for col in required_columns):
	            print("البيانات لا تحتوي على الأعمدة المطلوبة (text, label)")
	            return None

	        # Normalise before mapping so "Positive" or " negative " still match;
	        # anything unknown becomes NaN and is filtered out below.
	        label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
	        normalized_labels = df['label'].astype(str).str.strip().str.lower()
	        df['label'] = normalized_labels.map(label_map)

	        valid_rows = df['label'].notna() & df['text'].notna()
	        dropped = int((~valid_rows).sum())
	        if dropped:
	            print(f"تم تجاهل {dropped} من الصفوف: نص مفقود أو تسمية غير معروفة")

	        df = df.loc[valid_rows].reset_index(drop=True)
	        if df.empty:
	            print("لا توجد صفوف صالحة للتدريب بعد تنظيف البيانات")
	            return None

	        # NaN forced the column to float; the classifier needs integer ids.
	        df['label'] = df['label'].astype(int)

	        return Dataset.from_pandas(df)
	    except Exception as e:
	        print(f"خطأ في تجهيز البيانات: {str(e)}")
	        return None

	def train_sentiment_model(dataset, model_name="aubmindlab/bert-base-arabertv2", epochs=3, batch_size=16, output_dir="./sentiment_model"):
	    """Fine-tune a 3-class sequence classifier for sentiment analysis.

	    The dataset is tokenised on its ``text`` column, split 80/20 into
	    train/test, trained with ``Trainer``, evaluated, and saved to
	    ``output_dir``. The model is pushed to the Hub only when the
	    ``HF_USERNAME`` environment variable is set.

	    Args:
	        dataset: Dataset with a ``text`` column and a ``label`` column
	            (the model is created with ``num_labels=3``).
	        model_name: Pretrained checkpoint to fine-tune.
	        epochs: Number of training epochs.
	        batch_size: Per-device batch size for training and evaluation.
	        output_dir: Directory for checkpoints and the final model.

	    Returns:
	        A dict with ``model_path`` and ``eval_results``, or None if any
	        step raised.
	    """
	    try:
	        # Load the tokenizer and the model.
	        tokenizer = AutoTokenizer.from_pretrained(model_name)
	        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

	        # Tokenise the text column.
	        def tokenize_function(examples):
	            return tokenizer(examples["text"], padding="max_length", truncation=True)

	        tokenized_dataset = dataset.map(tokenize_function, batched=True)

	        # Split into train and test sets.
	        tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

	        # Enable Hub integration only when a real account name is configured.
	        # With push_to_hub=True and the placeholder "username/..." repo id,
	        # the Trainer would target a repository that does not belong to the
	        # caller and local-only training would fail.
	        hf_username = os.environ.get('HF_USERNAME')
	        hub_options = {}
	        if hf_username:
	            hub_options = {
	                "push_to_hub": True,
	                "hub_model_id": f"{hf_username}/chalets-sentiment-model",
	            }

	        # Training configuration.
	        training_args = TrainingArguments(
	            output_dir=output_dir,
	            learning_rate=2e-5,
	            per_device_train_batch_size=batch_size,
	            per_device_eval_batch_size=batch_size,
	            num_train_epochs=epochs,
	            weight_decay=0.01,
	            save_strategy="epoch",
	            **hub_options,
	        )

	        # Build the trainer.
	        trainer = Trainer(
	            model=model,
	            args=training_args,
	            train_dataset=tokenized_dataset["train"],
	            eval_dataset=tokenized_dataset["test"],
	            tokenizer=tokenizer,
	        )

	        # Train, then evaluate on the held-out split.
	        trainer.train()
	        eval_results = trainer.evaluate()

	        # Save the final model locally.
	        trainer.save_model(output_dir)

	        # Upload to the Hugging Face Hub when configured.
	        if hf_username:
	            trainer.push_to_hub()

	        return {
	            "model_path": output_dir,
	            "eval_results": eval_results
	        }
	    except Exception as e:
	        print(f"خطأ في تدريب النموذج: {str(e)}")
	        return None

	def prepare_content_generation_data(data_path):
	    """Load a content-generation CSV as a Hugging Face ``Dataset``.

	    The CSV must provide the ``input_text`` and ``target_text`` columns.

	    Args:
	        data_path: Path to the CSV file.

	    Returns:
	        The ``Dataset`` built from the CSV, or None when a required column
	        is missing or loading fails.
	    """
	    try:
	        frame = pd.read_csv(data_path)

	        # Collect whichever required columns are absent.
	        absent = [name for name in ('input_text', 'target_text') if name not in frame.columns]
	        if absent:
	            print("البيانات لا تحتوي على الأعمدة المطلوبة (input_text, target_text)")
	            return None

	        # Wrap the frame as a Hugging Face dataset.
	        return Dataset.from_pandas(frame)
	    except Exception as e:
	        print(f"خطأ في تجهيز البيانات: {str(e)}")
	        return None

	def train_content_generation_model(dataset, model_name="aubmindlab/arabart-base", epochs=3, batch_size=8, output_dir="./content_model"):
	    """Fine-tune a seq2seq model that generates chalet descriptions.

	    ``input_text`` is tokenised as the model input and ``target_text`` as
	    the labels, both truncated to 512 tokens. The data is split 80/20,
	    trained with ``Trainer``, evaluated, and saved to ``output_dir``. The
	    model is pushed to the Hub only when the ``HF_USERNAME`` environment
	    variable is set.

	    Args:
	        dataset: Dataset with ``input_text`` and ``target_text`` columns.
	        model_name: Pretrained seq2seq checkpoint to fine-tune.
	        epochs: Number of training epochs.
	        batch_size: Per-device batch size for training and evaluation.
	        output_dir: Directory for checkpoints and the final model.

	    Returns:
	        A dict with ``model_path`` and ``eval_results``, or None if any
	        step raised.
	    """
	    try:
	        # Load the tokenizer and the model.
	        tokenizer = AutoTokenizer.from_pretrained(model_name)
	        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

	        def preprocess_function(examples):
	            # No padding here: DataCollatorForSeq2Seq pads each batch and
	            # fills label padding with -100 so the loss ignores it. Padding
	            # labels to max_length with pad-token ids would make the model
	            # learn to predict padding.
	            return tokenizer(
	                examples["input_text"],
	                text_target=examples["target_text"],
	                max_length=512,
	                truncation=True,
	            )

	        # Drop the raw text columns so only model features reach the collator.
	        tokenized_dataset = dataset.map(
	            preprocess_function,
	            batched=True,
	            remove_columns=dataset.column_names,
	        )

	        # Split into train and test sets.
	        tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

	        # Enable Hub integration only when a real account name is configured.
	        # With push_to_hub=True and the placeholder "username/..." repo id,
	        # the Trainer would target a repository that does not belong to the
	        # caller and local-only training would fail.
	        hf_username = os.environ.get('HF_USERNAME')
	        hub_options = {}
	        if hf_username:
	            hub_options = {
	                "push_to_hub": True,
	                "hub_model_id": f"{hf_username}/chalets-content-model",
	            }

	        # Training configuration.
	        training_args = TrainingArguments(
	            output_dir=output_dir,
	            learning_rate=2e-5,
	            per_device_train_batch_size=batch_size,
	            per_device_eval_batch_size=batch_size,
	            num_train_epochs=epochs,
	            weight_decay=0.01,
	            save_strategy="epoch",
	            **hub_options,
	        )

	        # Pads inputs and labels per batch.
	        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

	        trainer = Trainer(
	            model=model,
	            args=training_args,
	            train_dataset=tokenized_dataset["train"],
	            eval_dataset=tokenized_dataset["test"],
	            data_collator=data_collator,
	            tokenizer=tokenizer,
	        )

	        # Train, then evaluate on the held-out split.
	        trainer.train()
	        eval_results = trainer.evaluate()

	        # Save the final model locally.
	        trainer.save_model(output_dir)

	        # Upload to the Hugging Face Hub when configured.
	        if hf_username:
	            trainer.push_to_hub()

	        return {
	            "model_path": output_dir,
	            "eval_results": eval_results
	        }
	    except Exception as e:
	        print(f"خطأ في تدريب النموذج: {str(e)}")
	        return None

	def get_available_models():
	    """Return the catalogue of model types that can be trained.

	    Returns:
	        A new list of dicts, one per model type, each with the keys
	        ``name``, ``id``, ``description``, ``base_model`` and
	        ``data_format`` (in that order).
	    """
	    fields = ("name", "id", "description", "base_model", "data_format")
	    catalogue = (
	        (
	            "نموذج تحليل المشاعر",
	            "sentiment",
	            "تحليل مشاعر التعليقات والتقييمات عن الشاليهات",
	            "aubmindlab/bert-base-arabertv2",
	            "text,label (positive/neutral/negative)",
	        ),
	        (
	            "نموذج توليد المحتوى",
	            "content",
	            "توليد محتوى وصفي للشاليهات",
	            "aubmindlab/arabart-base",
	            "input_text,target_text",
	        ),
	        (
	            "نموذج تلخيص المحتوى",
	            "summarization",
	            "تلخيص المحتوى الطويل عن الشاليهات",
	            "facebook/bart-large-cnn",
	            "text,summary",
	        ),
	    )
	    # Pair every row with the shared field names to build the entries.
	    return [dict(zip(fields, entry)) for entry in catalogue]

	def create_training_data_from_chalets(chalets_df, model_type):
	    """Derive a training DataFrame from chalet records.

	    Supported ``model_type`` values and the columns they produce:

	    * ``"sentiment"``: ``text``, ``label``. The label comes from counting
	      positive and negative keywords in the description.
	    * ``"content"``: ``input_text``, ``target_text``. A prompt built from
	      the name and location, with the description as target.
	    * ``"summarization"``: ``text``, ``summary``. Descriptions longer than
	      30 words, with their first sentence as the summary.

	    Rows whose required fields are missing (None/NaN) are skipped rather
	    than aborting the whole run. The result always carries the expected
	    columns, even when no row qualifies.

	    Args:
	        chalets_df: DataFrame of chalets; reads the ``name``, ``location``
	            and ``description`` columns as needed.
	        model_type: One of ``"sentiment"``, ``"content"``,
	            ``"summarization"``.

	    Returns:
	        The training DataFrame, or None if the input is empty, the model
	        type is unknown, or an unexpected error occurs.
	    """
	    try:
	        if chalets_df.empty:
	            print("بيانات الشاليهات فارغة")
	            return None

	        builder = _TRAINING_ROW_BUILDERS.get(model_type)
	        if builder is None:
	            print(f"نوع النموذج غير معروف: {model_type}")
	            return None

	        columns, build_row = builder
	        training_data = []
	        for _, row in chalets_df.iterrows():
	            example = build_row(row)
	            if example is not None:
	                training_data.append(example)

	        # Explicit columns keep the schema stable for an empty result.
	        return pd.DataFrame(training_data, columns=columns)

	    except Exception as e:
	        print(f"خطأ في إنشاء بيانات التدريب: {str(e)}")
	        return None


	# Keywords for the simple rule-based sentiment labelling.
	_POSITIVE_WORDS = ("رائع", "جميل", "ممتاز", "فاخر", "مريح", "متميز", "مثالي")
	_NEGATIVE_WORDS = ("سيء", "رديء", "مزعج", "مكلف", "صغير", "قديم")


	def _text_or_empty(value):
	    """Return value as text, mapping missing values (None/NaN/NA) to ''."""
	    if isinstance(value, str):
	        return value
	    if pd.api.types.is_scalar(value) and pd.isna(value):
	        return ""
	    return str(value)


	def _sentiment_row(row):
	    """Build a {text, label} example from a chalet row, or None to skip."""
	    description = _text_or_empty(row.get('description', ''))
	    if not description:
	        return None

	    lowered = description.lower()
	    positive_count = sum(1 for word in _POSITIVE_WORDS if word in lowered)
	    negative_count = sum(1 for word in _NEGATIVE_WORDS if word in lowered)

	    if positive_count > negative_count:
	        label = "positive"
	    elif negative_count > positive_count:
	        label = "negative"
	    else:
	        # A tie, including no keyword at all, is treated as neutral.
	        label = "neutral"

	    return {"text": description, "label": label}


	def _content_row(row):
	    """Build an {input_text, target_text} example, or None to skip."""
	    name = _text_or_empty(row.get('name', ''))
	    location = _text_or_empty(row.get('location', ''))
	    description = _text_or_empty(row.get('description', ''))

	    if not (name and location and description):
	        return None

	    return {
	        "input_text": f"شاليه {name} في {location}",
	        "target_text": description,
	    }


	def _summarization_row(row):
	    """Build a {text, summary} example from a long description, or None."""
	    description = _text_or_empty(row.get('description', ''))
	    if len(description.split()) <= 30:
	        return None

	    # The first sentence serves as the summary; an empty one is useless
	    # as a training target.
	    summary = description.split('.')[0].strip()
	    if not summary:
	        return None

	    return {"text": description, "summary": summary}


	# model_type -> (output columns, row builder)
	_TRAINING_ROW_BUILDERS = {
	    "sentiment": (["text", "label"], _sentiment_row),
	    "content": (["input_text", "target_text"], _content_row),
	    "summarization": (["text", "summary"], _summarization_row),
	}

	def save_training_data(df, filename):
	    """Write training data to a CSV file without the index column.

	    Args:
	        df: DataFrame to save.
	        filename: Destination path of the CSV file.

	    Returns:
	        True when the file was written, False if writing raised.
	    """
	    try:
	        df.to_csv(filename, index=False)
	    except Exception as e:
	        print(f"خطأ في حفظ بيانات التدريب: {str(e)}")
	        return False
	    else:
	        return True