import pandas as pd
import numpy as np
from datasets import Dataset
import os
import json
import re
from collections import Counter
from urllib.parse import quote
import requests
from bs4 import BeautifulSoup
def load_tourism_datasets():
    """Load the chalets dataset from a cached CSV, scraping the web if no cache exists."""
    try:
        if os.path.exists("real_chalets_data.csv"):
            df = pd.read_csv("real_chalets_data.csv")
            dataset = Dataset.from_pandas(df)
            return dataset
        else:
            chalets_data = scrape_chalets_from_web()
            if chalets_data:
                df = pd.DataFrame(chalets_data)
                # Cache the scraped data so later runs skip the network round trip.
                df.to_csv("real_chalets_data.csv", index=False)
                dataset = Dataset.from_pandas(df)
                return dataset
            else:
                print("No chalet data found")
                return Dataset.from_pandas(pd.DataFrame())
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return Dataset.from_pandas(pd.DataFrame())
def scrape_chalets_from_web():
    """Scrape chalet rental listings from aqarmap.com.eg for a fixed set of locations."""
    chalets = []
    locations = ["العين السخنة", "الساحل الشمالي", "شرم الشيخ", "الغردقة", "رأس سدر"]
    for location in locations:
        encoded_location = quote(location)
        url = f"https://aqarmap.com.eg/ar/for-rent/chalet/{encoded_location}/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {str(e)}")
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        listings = soup.find_all("div", class_="listing-card")
        for listing in listings:
            try:
                # Extract data from each listing. The original extraction code was
                # elided here; the selectors below are assumptions about the page
                # markup and will likely need adjusting against the live site.
                title_tag = listing.find("h2") or listing.find("a")
                title = title_tag.get_text(strip=True) if title_tag else ""
                price_tag = listing.find("span", class_="price")
                price = price_tag.get_text(strip=True) if price_tag else ""
                description_tag = listing.find("p")
                description = description_tag.get_text(strip=True) if description_tag else ""
                image_tag = listing.find("img")
                image_url = image_tag.get("src", "") if image_tag else ""
                # Fields not shown on the listing card default to empty values.
                capacity = ""
                rating = 0
                amenities_str = ""
                season = ""
                target_audience = ""
                chalet = {
                    'name': title,
                    'location': location,
                    'price': price,
                    'capacity': capacity,
                    'rating': rating,
                    'amenities': amenities_str,
                    'season': season,
                    'target_audience': target_audience,
                    'description': description,
                    'image_url': image_url
                }
                chalets.append(chalet)
            except Exception as e:
                print(f"Error extracting chalet data: {str(e)}")
    return chalets
def filter_dataset_by_criteria(dataset, criteria):
    """Filter a Dataset, keeping rows that match every non-empty criterion."""
    filtered_dataset = dataset
    for key, value in criteria.items():
        if value:
            # Bind key/value as lambda defaults so each filter captures the current
            # pair instead of the loop's final values (late-binding closure bug).
            filtered_dataset = filtered_dataset.filter(lambda example, k=key, v=value: example[k] == v)
    return filtered_dataset
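# Example usage (hypothetical values; the keys mirror the columns produced by
# scrape_chalets_from_web, and empty values are skipped rather than matched):
#
#     criteria = {"location": "الساحل الشمالي", "season": "", "target_audience": "عائلات"}
#     filtered = filter_dataset_by_criteria(dataset, criteria)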
def analyze_dataset(dataset):
    """Summarize the dataset: row count, averages, and location/audience/season distributions."""
    df = dataset.to_pandas()
    if df.empty:
        return {
            'count': 0,
            'locations': {},
            'avg_price': 0,
            'avg_rating': 0,
            'audience_distribution': {},
            'season_distribution': {}
        }
    # Coerce price/rating to numeric so string values from scraping don't break mean().
    analysis = {
        'count': len(df),
        'locations': df['location'].value_counts().to_dict() if 'location' in df.columns else {},
        'avg_price': pd.to_numeric(df['price'], errors='coerce').mean() if 'price' in df.columns else 0,
        'avg_rating': pd.to_numeric(df['rating'], errors='coerce').mean() if 'rating' in df.columns else 0,
        'audience_distribution': df['target_audience'].value_counts().to_dict() if 'target_audience' in df.columns else {},
        'season_distribution': df['season'].value_counts().to_dict() if 'season' in df.columns else {}
    }
    return analysis
def extract_keywords_from_dataset(dataset, column='description'):
    """Return the 20 most frequent words in `column`, excluding common Arabic stopwords."""
    df = dataset.to_pandas()
    if df.empty or column not in df.columns:
        return []
    all_text = " ".join(df[column].astype(str).tolist())
    all_text = re.sub(r'[^\w\s]', ' ', all_text)
    words = re.findall(r'\b\w+\b', all_text.lower())
    # A small hand-picked list of Arabic stopwords to filter out.
    common_words = ["في", "على", "من", "إلى", "عن", "مع", "هذا", "هذه", "ذلك", "تلك", "و", "ب",
                    "ال", "هو", "هي", "نحن", "هم", "انت", "انتم", "كان", "كانت", "يكون", "تكون"]
    filtered_words = [word for word in words if word not in common_words and len(word) > 2]
    word_counts = Counter(filtered_words)
    sorted_keywords = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    return [{"word": word, "count": count} for word, count in sorted_keywords[:20]]
def save_dataset_to_csv(dataset, filename):
    """Write a Dataset to CSV; return True on success."""
    try:
        df = dataset.to_pandas()
        df.to_csv(filename, index=False)
        return True
    except Exception as e:
        print(f"Error saving dataset: {str(e)}")
        return False
def load_dataset_from_csv(filename):
    """Load a Dataset from a CSV file; return None if the file is missing or unreadable."""
    try:
        if os.path.exists(filename):
            df = pd.read_csv(filename)
            dataset = Dataset.from_pandas(df)
            return dataset
        else:
            print(f"File {filename} does not exist")
            return None
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return None
def merge_datasets(datasets):
    """Concatenate a list of Datasets into one, skipping None entries."""
    try:
        dfs = [dataset.to_pandas() for dataset in datasets if dataset is not None]
        if dfs:
            merged_df = pd.concat(dfs, ignore_index=True)
            merged_dataset = Dataset.from_pandas(merged_df)
            return merged_dataset
        else:
            return None
    except Exception as e:
        print(f"Error merging datasets: {str(e)}")
        return None
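# Minimal smoke-test sketch, assuming the module is run directly and that the
# scrape (or a cached real_chalets_data.csv) yields the columns used above.
if __name__ == "__main__":
    dataset = load_tourism_datasets()
    analysis = analyze_dataset(dataset)
    # default=str handles numpy scalar types that json can't serialize directly.
    print(json.dumps(analysis, ensure_ascii=False, indent=2, default=str))
    for item in extract_keywords_from_dataset(dataset):
        print(f"{item['word']}: {item['count']}")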