Spaces:

valory
/

prediction_markets_ranking

Sleeping

File size: 4,004 Bytes

import pandas as pd
import gradio as gr
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extra tokens dropped from the word cloud in addition to the standard English
# stop words: month names and year strings.
months = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]
years = ["2024", "2025"]
# Months first, then years, in a single list.
filter_words = [*months, *years]


def plot_top_10_ranking_by_nr_trades(market_metrics: pd.DataFrame) -> gr.Plot:
    """Plot the ten markets with the most trades as a bar chart.

    Args:
        market_metrics: Frame providing the "market_id", "title",
            "nr_trades" and "total_traders" columns. It is not modified.

    Returns:
        A gr.Plot wrapping the Plotly bar figure. If the frame has fewer
        than ten rows, all of them are plotted.
    """
    market_metrics_sorted_by_trades = market_metrics.sort_values(
        by="nr_trades", ascending=False
    )
    top_10_markets = market_metrics_sorted_by_trades.head(10)

    # Hover text combines the market title with its number of traders.
    # assign() returns a new frame, so the column is never written into a
    # slice of the sorted frame (which triggers SettingWithCopyWarning).
    top_10_markets = top_10_markets.assign(
        hover_text=top_10_markets["title"]
        + "<br>Number of Traders: "
        + top_10_markets["total_traders"].astype(str)
    )

    fig = px.bar(
        top_10_markets,
        x="market_id",
        y="nr_trades",
        hover_data=["hover_text"],
        title="Ranking of Markets by Number of Trades",
    )

    fig.update_layout(
        xaxis_title="Markets",
        yaxis_title="Number of Trades",
        # Tick labels on the x axis (the market ids) are hidden.
        xaxis={"showticklabels": False},
    )

    return gr.Plot(
        value=fig,
    )


def plot_trades_and_traders_ranking(market_metrics: pd.DataFrame) -> gr.Plot:
    """Build a scatter plot of number of trades against number of traders.

    Args:
        market_metrics: Frame read for its "total_traders", "nr_trades" and
            "title" columns.

    Returns:
        A gr.Plot wrapping the Plotly scatter figure.
    """
    print("plotting trades and traders scatterplot")

    # Hover box: title (carried through custom_data), then the y and x values.
    hover_lines = (
        "Title: %{customdata[0]}<br>",
        "Nr trades: %{y}<br>",
        "Total traders: %{x}<br>",
    )
    layout_options = {
        "xaxis_title": "Total Number of Traders",
        "yaxis_title": "Total Number of Trades",
        # Fixed size, adjusted for better fit on laptop screens.
        "width": 1000,
        "height": 600,
    }

    scatter = px.scatter(
        market_metrics,
        x="total_traders",
        y="nr_trades",
        color="nr_trades",
        color_continuous_scale="viridis",
        custom_data=["title"],
    )
    scatter.update_layout(**layout_options)
    scatter.update_traces(hovertemplate="".join(hover_lines))

    return gr.Plot(value=scatter)


def plot_wordcloud_topics(market_metrics: pd.DataFrame) -> gr.Plot:
    """Render a word cloud of the terms found in the most-traded market titles.

    The titles of the 100 rows with the highest "nr_trades" are vectorised
    with TF-IDF (English stop words plus the module-level ``filter_words``
    removed, at most 100 terms kept). Each term's mean TF-IDF score across
    those titles is used as its weight in the cloud.

    Args:
        market_metrics: Frame providing the "nr_trades" and "title" columns.

    Returns:
        A gr.Plot wrapping the Matplotlib figure that shows the word cloud.
    """
    # Local import keeps this block self-contained; matplotlib is already a
    # dependency of this module through pyplot.
    from matplotlib.figure import Figure

    # Sort the data by 'nr_trades' in descending order
    market_metrics_sorted = market_metrics.sort_values(by="nr_trades", ascending=False)
    # Get the titles of the top 100 markets
    top_100_titles = market_metrics_sorted["title"].head(100)
    # Combine standard English stop words with custom filter words
    all_stop_words = list(set(ENGLISH_STOP_WORDS).union(filter_words))

    # Create and configure TF-IDF Vectorizer
    tfidf = TfidfVectorizer(
        stop_words=all_stop_words, max_features=100, max_df=0.95, min_df=1
    )
    # Fit and transform the titles
    tfidf_matrix = tfidf.fit_transform(top_100_titles)

    # Get feature names (terms)
    terms = tfidf.get_feature_names_out()
    # Calculate average TF-IDF scores for each term
    avg_scores = np.mean(tfidf_matrix.toarray(), axis=0)
    word_scores = dict(zip(terms, avg_scores))

    # Create and generate a word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        max_words=50,
        prefer_horizontal=0.7,
    ).generate_from_frequencies(word_scores)

    # Build the figure directly instead of through plt.figure(): pyplot keeps
    # a global reference to every figure it creates until it is explicitly
    # closed, so each call would otherwise leak one figure.
    fig = Figure(figsize=(10, 5))
    ax = fig.add_subplot(111)

    # Plot wordcloud using the axes object
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title("Word Cloud of Market Titles")

    return gr.Plot(
        value=fig,
    )