import pandas as pd
import os
import re
from collections import Counter
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from datasets import Dataset

def load_tourism_datasets():
    """Load the chalet dataset from the local CSV cache, scraping the web
    to build the cache if it does not exist yet."""
    try:
        if os.path.exists("real_chalets_data.csv"):
            df = pd.read_csv("real_chalets_data.csv")
            return Dataset.from_pandas(df)
        chalets_data = scrape_chalets_from_web()
        if chalets_data:
            df = pd.DataFrame(chalets_data)
            df.to_csv("real_chalets_data.csv", index=False)
            return Dataset.from_pandas(df)
        print("No chalet data was found")
        return Dataset.from_pandas(pd.DataFrame())
    except Exception as e:
        print(f"Error loading the dataset: {str(e)}")
        return Dataset.from_pandas(pd.DataFrame())

def scrape_chalets_from_web():
    """Scrape chalet rental listings from aqarmap for a fixed set of
    Egyptian destinations and return them as a list of dicts."""
    chalets = []
    locations = ["العين السخنة", "الساحل الشمالي", "شرم الشيخ", "الغردقة", "رأس سدر"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for location in locations:
        encoded_location = quote(location)
        url = f"https://aqarmap.com.eg/ar/for-rent/chalet/{encoded_location}/"
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching listings for {location}: {str(e)}")
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        listings = soup.find_all("div", class_="listing-card")
        for listing in listings:
            try:
                # Extract the data from each listing. The original
                # extraction code was elided here, so the selectors and
                # defaults below are assumptions about the page markup and
                # should be adjusted against the live site.
                title_tag = listing.find("h2") or listing.find("a")
                title = title_tag.get_text(strip=True) if title_tag else ""
                price_tag = listing.find(class_="price")
                price_digits = re.sub(r"[^\d]", "", price_tag.get_text()) if price_tag else ""
                price = int(price_digits) if price_digits else 0
                desc_tag = listing.find("p")
                description = desc_tag.get_text(strip=True) if desc_tag else ""
                img_tag = listing.find("img")
                image_url = img_tag.get("src", "") if img_tag else ""
                # These fields are not reliably present on a listing card;
                # placeholder defaults keep the record schema consistent.
                capacity = 0
                rating = 0.0
                amenities_str = ""
                season = ""
                target_audience = ""
                chalet = {
                    'name': title,
                    'location': location,
                    'price': price,
                    'capacity': capacity,
                    'rating': rating,
                    'amenities': amenities_str,
                    'season': season,
                    'target_audience': target_audience,
                    'description': description,
                    'image_url': image_url
                }
                chalets.append(chalet)
            except Exception as e:
                print(f"Error extracting chalet data: {str(e)}")
    return chalets

def filter_dataset_by_criteria(dataset, criteria):
    """Filter a dataset by exact matches on the given column/value pairs,
    skipping criteria whose value is empty."""
    filtered_dataset = dataset
    for key, value in criteria.items():
        if value:
            # Bind key/value as defaults so the lambda does not depend on
            # the loop variables.
            filtered_dataset = filtered_dataset.filter(
                lambda example, k=key, v=value: example[k] == v
            )
    return filtered_dataset
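
# Illustrative usage (the column names match the scraped schema above; the
# Arabic value is an assumption for the example, not confirmed data):
#
#     criteria = {"location": "الغردقة", "target_audience": "", "season": None}
#     filtered = filter_dataset_by_criteria(dataset, criteria)
#
# Empty or None values are skipped, so a caller can pass a fixed criteria
# dict and only fill in the fields the user actually selected.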

def analyze_dataset(dataset):
    """Compute summary statistics (count, averages, distributions) over
    the chalet dataset."""
    df = dataset.to_pandas()
    if df.empty:
        return {
            'count': 0,
            'locations': {},
            'avg_price': 0,
            'avg_rating': 0,
            'audience_distribution': {},
            'season_distribution': {}
        }
    analysis = {
        'count': len(df),
        'locations': df['location'].value_counts().to_dict() if 'location' in df.columns else {},
        'avg_price': df['price'].mean() if 'price' in df.columns else 0,
        'avg_rating': df['rating'].mean() if 'rating' in df.columns else 0,
        'audience_distribution': df['target_audience'].value_counts().to_dict() if 'target_audience' in df.columns else {},
        'season_distribution': df['season'].value_counts().to_dict() if 'season' in df.columns else {}
    }
    return analysis

def extract_keywords_from_dataset(dataset, column='description'):
    """Return the 20 most frequent content words in the given text column,
    after stripping punctuation and a small set of Arabic stopwords."""
    df = dataset.to_pandas()
    if df.empty or column not in df.columns:
        return []
    all_text = " ".join(df[column].astype(str).tolist())
    all_text = re.sub(r'[^\w\s]', ' ', all_text)
    words = re.findall(r'\b\w+\b', all_text.lower())
    # A small hand-picked list of Arabic function words to exclude.
    common_words = ["في", "على", "من", "إلى", "عن", "مع", "هذا", "هذه", "ذلك", "تلك", "و", "ب",
                    "ال", "هو", "هي", "نحن", "هم", "انت", "انتم", "كان", "كانت", "يكون", "تكون"]
    filtered_words = [word for word in words if word not in common_words and len(word) > 2]
    word_counts = Counter(filtered_words)
    return [{"word": word, "count": count} for word, count in word_counts.most_common(20)]

def save_dataset_to_csv(dataset, filename):
    """Save a dataset to a CSV file; return True on success."""
    try:
        dataset.to_pandas().to_csv(filename, index=False)
        return True
    except Exception as e:
        print(f"Error saving the dataset: {str(e)}")
        return False

def load_dataset_from_csv(filename):
    """Load a dataset from a CSV file, or return None if it is missing."""
    try:
        if os.path.exists(filename):
            return Dataset.from_pandas(pd.read_csv(filename))
        print(f"The file {filename} does not exist")
        return None
    except Exception as e:
        print(f"Error loading the dataset: {str(e)}")
        return None

def merge_datasets(datasets):
    """Concatenate all non-None datasets into one; return None if there is
    nothing to merge."""
    try:
        dfs = [dataset.to_pandas() for dataset in datasets if dataset is not None]
        if not dfs:
            return None
        merged_df = pd.concat(dfs, ignore_index=True)
        return Dataset.from_pandas(merged_df)
    except Exception as e:
        print(f"Error merging the datasets: {str(e)}")
        return None
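
# Minimal smoke test of the module's pipeline: load (or scrape), analyze,
# extract keywords, and save a backup copy. "chalets_backup.csv" is a
# hypothetical output name chosen for this example.
if __name__ == "__main__":
    dataset = load_tourism_datasets()
    print(f"Loaded {len(dataset)} chalet records")
    if len(dataset) > 0:
        analysis = analyze_dataset(dataset)
        print(f"Locations: {analysis['locations']}")
        print(f"Average price: {analysis['avg_price']:.0f}")
        keywords = extract_keywords_from_dataset(dataset)
        print(f"Top keywords: {keywords[:5]}")
        save_dataset_to_csv(dataset, "chalets_backup.csv")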