{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Error reading the parquet file\n" ] } ], "source": [ "try:\n", " markets = pd.read_parquet(\"../data/fpmms.parquet\")\n", "except Exception:\n", " print(\"Error reading the parquet file\")\n", "\n", "# markets[\"currentAnswer\"] = markets[\"currentAnswer\"].apply(lambda x: x.lower())\n", "# # filter only markets with yes, no answers\n", "# valid_answers = [\"yes\", \"no\"]\n", "# markets = markets.loc[markets[\"currentAnswer\"].isin(valid_answers)]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "try:\n", " markets_df = pd.read_parquet(\"../data/fpmmTrades.parquet\")\n", "except Exception:\n", " print(\"Error reading the parquet file\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 85876 entries, 0 to 85875\n", "Data columns (total 24 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 collateralAmount 85876 non-null object\n", " 1 collateralAmountUSD 85876 non-null object\n", " 2 collateralToken 85876 non-null object\n", " 3 creationTimestamp 85876 non-null object\n", " 4 trader_address 85876 non-null object\n", " 5 feeAmount 85876 non-null object\n", " 6 id 85876 non-null object\n", " 7 oldOutcomeTokenMarginalPrice 85876 non-null object\n", " 8 outcomeIndex 85876 non-null object\n", " 9 outcomeTokenMarginalPrice 85876 non-null object\n", " 10 outcomeTokensTraded 85876 non-null object\n", " 11 title 85876 non-null object\n", " 12 transactionHash 85876 non-null object\n", " 13 type 85876 non-null object\n", " 14 market_creator 85876 non-null object\n", " 15 fpmm.answerFinalizedTimestamp 54399 non-null object\n", " 16 fpmm.arbitrationOccurred 85876 non-null bool \n", " 17 fpmm.currentAnswer 54399 non-null object\n", " 18 fpmm.id 85876 non-null object\n", " 19 fpmm.isPendingArbitration 85876 non-null bool \n", " 20 fpmm.openingTimestamp 85876 non-null object\n", " 21 fpmm.outcomes 85876 non-null object\n", " 22 fpmm.title 85876 non-null object\n", " 23 fpmm.condition.id 85876 non-null object\n", "dtypes: bool(2), object(22)\n", "memory usage: 14.6+ MB\n" ] } ], "source": [ "markets_df.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['collateralAmount', 'collateralAmountUSD', 'collateralToken',\n", " 'creationTimestamp', 'trader_address', 'feeAmount', 'id',\n", " 'oldOutcomeTokenMarginalPrice', 'outcomeIndex',\n", " 'outcomeTokenMarginalPrice', 'outcomeTokensTraded', 'title',\n", " 'transactionHash', 'type', 'market_creator',\n", " 'fpmm.answerFinalizedTimestamp', 'fpmm.arbitrationOccurred',\n", " 'fpmm.currentAnswer', 'fpmm.id', 'fpmm.isPendingArbitration',\n", " 'fpmm.openingTimestamp', 'fpmm.outcomes', 'fpmm.title',\n", " 'fpmm.condition.id'],\n", " dtype='object')" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets_df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(markets.id.unique())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridtitlemarket_creator
0no0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5Will the first floating offshore wind research...quickstart
1no0x0020d13c89140b47e10db54cbd53852b90bc1391Will the Francis Scott Key Bridge in Baltimore...quickstart
2no0x003ae5e007cc38b3f86b0ed7c82f938a1285ac07Will FC Saarbrucken reach the final of the Ger...quickstart
3yes0x004c8d4c619dc6b9caa940f5ea7ef699ae85359cWill the pro-life activists convicted for 'con...quickstart
4yes0x005e3f7a90585acbec807425a750fbba1d0c2b5cWill Apple announce the release of a new M4 ch...quickstart
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "0 no 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 \n", "1 no 0x0020d13c89140b47e10db54cbd53852b90bc1391 \n", "2 no 0x003ae5e007cc38b3f86b0ed7c82f938a1285ac07 \n", "3 yes 0x004c8d4c619dc6b9caa940f5ea7ef699ae85359c \n", "4 yes 0x005e3f7a90585acbec807425a750fbba1d0c2b5c \n", "\n", " title market_creator \n", "0 Will the first floating offshore wind research... quickstart \n", "1 Will the Francis Scott Key Bridge in Baltimore... quickstart \n", "2 Will FC Saarbrucken reach the final of the Ger... quickstart \n", "3 Will the pro-life activists convicted for 'con... quickstart \n", "4 Will Apple announce the release of a new M4 ch... quickstart " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "trades = pd.read_parquet(\"../data/fpmmTrades.parquet\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
collateralAmountcollateralAmountUSDcollateralTokencreationTimestamptrader_addressfeeAmountidoldOutcomeTokenMarginalPriceoutcomeIndexoutcomeTokenMarginalPrice...market_creatorfpmm.answerFinalizedTimestampfpmm.arbitrationOccurredfpmm.currentAnswerfpmm.idfpmm.isPendingArbitrationfpmm.openingTimestampfpmm.outcomesfpmm.titlefpmm.condition.id
09305967650456174080.93059779934117533864348280336664730xe91d153e0b41518a2ce8dd3d7944fa863463a97d17285966050x01274796ce41aa8e8312e05a427ffb4b0d2148f693059676504561740x007068173910cf8719b6f2e66a18b6825c9dde820x01...0.558111979762980196833836180241856400.611825749650855211231211687533889...quickstart1728822710False0x00000000000000000000000000000000000000000000...0x007068173910cf8719b6f2e66a18b6825c9dde82False1728691200[Yes, No]Will the emergency public warning tests planne...0xa610166e379c42404bd27bf12a16119fdb5171990c3e...
110332472347961938001.0332501260033394937910329936745250xe91d153e0b41518a2ce8dd3d7944fa863463a97d17285055750x034c4ad84f7ac6638bf19300d5bbe7d9b981e736103324723479619380x007068173910cf8719b6f2e66a18b6825c9dde820x03...0.660208990298303445124446130809070700.7034159692833852946883644485233207...quickstart1728822710False0x00000000000000000000000000000000000000000000...0x007068173910cf8719b6f2e66a18b6825c9dde82False1728691200[Yes, No]Will the emergency public warning tests planne...0xa610166e379c42404bd27bf12a16119fdb5171990c3e...
212066923688428983001.2066915962481879683670637170788840xe91d153e0b41518a2ce8dd3d7944fa863463a97d17285628950x05e8bbdb89c84a14d05194bbbae81caf2340db72120669236884289830x007068173910cf8719b6f2e66a18b6825c9dde820x05...0.193145918304372186430986421068454610.3033804066591317111055858533563476...quickstart1728822710False0x00000000000000000000000000000000000000000000...0x007068173910cf8719b6f2e66a18b6825c9dde82False1728691200[Yes, No]Will the emergency public warning tests planne...0xa610166e379c42404bd27bf12a16119fdb5171990c3e...
39305982032745443840.93059923757170080912179287297934220xe91d153e0b41518a2ce8dd3d7944fa863463a97d17285966450x17c17ca981b7e244d0bad80b632a082dc1db36e593059820327454430x007068173910cf8719b6f2e66a18b6825c9dde820x17...0.61182574965085521123121168753388900.6579972404391247884756597316198778...quickstart1728822710False0x00000000000000000000000000000000000000000000...0x007068173910cf8719b6f2e66a18b6825c9dde82False1728691200[Yes, No]Will the emergency public warning tests planne...0xa610166e379c42404bd27bf12a16119fdb5171990c3e...
417986959651029184001.7986967959313423139361257822752250xe91d153e0b41518a2ce8dd3d7944fa863463a97d17283377800x1d942103400c1f1657dcbffd5e08904787ea936b179869596510291840x007068173910cf8719b6f2e66a18b6825c9dde820x1d...0.763615736941978768175557728675570300.8080447772492735383356100969932859...quickstart1728822710False0x00000000000000000000000000000000000000000000...0x007068173910cf8719b6f2e66a18b6825c9dde82False1728691200[Yes, No]Will the emergency public warning tests planne...0xa610166e379c42404bd27bf12a16119fdb5171990c3e...
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " collateralAmount collateralAmountUSD \\\n", "0 930596765045617408 0.9305977993411753386434828033666473 \n", "1 1033247234796193800 1.033250126003339493791032993674525 \n", "2 1206692368842898300 1.206691596248187968367063717078884 \n", "3 930598203274544384 0.9305992375717008091217928729793422 \n", "4 1798695965102918400 1.798696795931342313936125782275225 \n", "\n", " collateralToken creationTimestamp \\\n", "0 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1728596605 \n", "1 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1728505575 \n", "2 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1728562895 \n", "3 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1728596645 \n", "4 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1728337780 \n", "\n", " trader_address feeAmount \\\n", "0 0x01274796ce41aa8e8312e05a427ffb4b0d2148f6 9305967650456174 \n", "1 0x034c4ad84f7ac6638bf19300d5bbe7d9b981e736 10332472347961938 \n", "2 0x05e8bbdb89c84a14d05194bbbae81caf2340db72 12066923688428983 \n", "3 0x17c17ca981b7e244d0bad80b632a082dc1db36e5 9305982032745443 \n", "4 0x1d942103400c1f1657dcbffd5e08904787ea936b 17986959651029184 \n", "\n", " id \\\n", "0 0x007068173910cf8719b6f2e66a18b6825c9dde820x01... \n", "1 0x007068173910cf8719b6f2e66a18b6825c9dde820x03... \n", "2 0x007068173910cf8719b6f2e66a18b6825c9dde820x05... \n", "3 0x007068173910cf8719b6f2e66a18b6825c9dde820x17... \n", "4 0x007068173910cf8719b6f2e66a18b6825c9dde820x1d... \n", "\n", " oldOutcomeTokenMarginalPrice outcomeIndex \\\n", "0 0.5581119797629801968338361802418564 0 \n", "1 0.6602089902983034451244461308090707 0 \n", "2 0.1931459183043721864309864210684546 1 \n", "3 0.611825749650855211231211687533889 0 \n", "4 0.7636157369419787681755577286755703 0 \n", "\n", " outcomeTokenMarginalPrice ... market_creator \\\n", "0 0.611825749650855211231211687533889 ... quickstart \n", "1 0.7034159692833852946883644485233207 ... quickstart \n", "2 0.3033804066591317111055858533563476 ... quickstart \n", "3 0.6579972404391247884756597316198778 ... quickstart \n", "4 0.8080447772492735383356100969932859 ... quickstart \n", "\n", " fpmm.answerFinalizedTimestamp fpmm.arbitrationOccurred \\\n", "0 1728822710 False \n", "1 1728822710 False \n", "2 1728822710 False \n", "3 1728822710 False \n", "4 1728822710 False \n", "\n", " fpmm.currentAnswer \\\n", "0 0x00000000000000000000000000000000000000000000... \n", "1 0x00000000000000000000000000000000000000000000... \n", "2 0x00000000000000000000000000000000000000000000... \n", "3 0x00000000000000000000000000000000000000000000... \n", "4 0x00000000000000000000000000000000000000000000... \n", "\n", " fpmm.id fpmm.isPendingArbitration \\\n", "0 0x007068173910cf8719b6f2e66a18b6825c9dde82 False \n", "1 0x007068173910cf8719b6f2e66a18b6825c9dde82 False \n", "2 0x007068173910cf8719b6f2e66a18b6825c9dde82 False \n", "3 0x007068173910cf8719b6f2e66a18b6825c9dde82 False \n", "4 0x007068173910cf8719b6f2e66a18b6825c9dde82 False \n", "\n", " fpmm.openingTimestamp fpmm.outcomes \\\n", "0 1728691200 [Yes, No] \n", "1 1728691200 [Yes, No] \n", "2 1728691200 [Yes, No] \n", "3 1728691200 [Yes, No] \n", "4 1728691200 [Yes, No] \n", "\n", " fpmm.title \\\n", "0 Will the emergency public warning tests planne... \n", "1 Will the emergency public warning tests planne... \n", "2 Will the emergency public warning tests planne... \n", "3 Will the emergency public warning tests planne... \n", "4 Will the emergency public warning tests planne... \n", "\n", " fpmm.condition.id \n", "0 0xa610166e379c42404bd27bf12a16119fdb5171990c3e... \n", "1 0xa610166e379c42404bd27bf12a16119fdb5171990c3e... \n", "2 0xa610166e379c42404bd27bf12a16119fdb5171990c3e... \n", "3 0xa610166e379c42404bd27bf12a16119fdb5171990c3e... \n", "4 0xa610166e379c42404bd27bf12a16119fdb5171990c3e... \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trades.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 26835 entries, 0 to 26834\n", "Data columns (total 24 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 collateralAmount 26835 non-null object\n", " 1 collateralAmountUSD 26835 non-null object\n", " 2 collateralToken 26835 non-null object\n", " 3 creationTimestamp 26835 non-null object\n", " 4 trader_address 26835 non-null object\n", " 5 feeAmount 26835 non-null object\n", " 6 id 26835 non-null object\n", " 7 oldOutcomeTokenMarginalPrice 26835 non-null object\n", " 8 outcomeIndex 26835 non-null object\n", " 9 outcomeTokenMarginalPrice 26835 non-null object\n", " 10 outcomeTokensTraded 26835 non-null object\n", " 11 title 26835 non-null object\n", " 12 transactionHash 26835 non-null object\n", " 13 type 26835 non-null object\n", " 14 market_creator 26835 non-null object\n", " 15 fpmm.answerFinalizedTimestamp 24829 non-null object\n", " 16 fpmm.arbitrationOccurred 26835 non-null bool \n", " 17 fpmm.currentAnswer 24829 non-null object\n", " 18 fpmm.id 26835 non-null object\n", " 19 fpmm.isPendingArbitration 26835 non-null bool \n", " 20 fpmm.openingTimestamp 26835 non-null object\n", " 21 fpmm.outcomes 26835 non-null object\n", " 22 fpmm.title 26835 non-null object\n", " 23 fpmm.condition.id 26835 non-null object\n", "dtypes: bool(2), object(22)\n", "memory usage: 4.6+ MB\n" ] } ], "source": [ "trades.info()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['collateralAmount', 'collateralAmountUSD', 'collateralToken',\n", " 'creationTimestamp', 'trader_address', 'feeAmount', 'id',\n", " 'oldOutcomeTokenMarginalPrice', 'outcomeIndex',\n", " 'outcomeTokenMarginalPrice', 'outcomeTokensTraded', 'title',\n", " 'transactionHash', 'type', 'market_creator',\n", " 'fpmm.answerFinalizedTimestamp', 'fpmm.arbitrationOccurred',\n", " 'fpmm.currentAnswer', 'fpmm.id', 'fpmm.isPendingArbitration',\n", " 'fpmm.openingTimestamp', 'fpmm.outcomes', 'fpmm.title',\n", " 'fpmm.condition.id'],\n", " dtype='object')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trades.columns" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "markets = list(trades[\"fpmm.id\"].unique())" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "803" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(markets)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/gp/02mb1d514ng739czlxw1lhh00000gn/T/ipykernel_3094/2495807215.py:12: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " trade_markets.rename(\n" ] } ], "source": [ "from datetime import datetime\n", "INVALID_ANSWER_HEX = (\n", " \"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff\"\n", ")\n", "columns_of_interest = [\n", " \"fpmm.currentAnswer\",\n", " \"fpmm.id\",\n", " \"fpmm.openingTimestamp\",\n", " \"market_creator\",\n", " ]\n", "trade_markets = trades[columns_of_interest]\n", "trade_markets.rename(\n", " columns={\n", " \"fpmm.currentAnswer\": \"currentAnswer\",\n", " \"fpmm.openingTimestamp\": \"openingTimestamp\",\n", " \"fpmm.id\": \"id\",\n", " },\n", " inplace=True,\n", ")\n", "trade_markets = trade_markets.drop_duplicates(subset=['id'], keep='last')\n", "# remove invalid answers\n", "\n", "trade_markets = trade_markets.loc[trade_markets[\"currentAnswer\"]!= INVALID_ANSWER_HEX]\n", "trade_markets[\"currentAnswer\"] = trade_markets[\"currentAnswer\"].apply(\n", " lambda x: convert_hex_to_int(x)\n", ")\n", "trade_markets[\"opening_datetime\"] = trade_markets[\"openingTimestamp\"].apply(\n", " lambda x: datetime.fromtimestamp(int(x))\n", ")\n", "trade_markets = trade_markets.sort_values(by=\"opening_datetime\", ascending=True)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "648" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets.id.unique())" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "648" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trade_markets.tail()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "719" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "trade_markets.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "648" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetime
207921.00xcc9c26a86dd55aa04dcb0066c9b8fca2983f407d1727136000quickstart2024-09-24 02:00:00
211301.00xd1bd18d7601d106639f922f1b5d2eda025c26be71727136000quickstart2024-09-24 02:00:00
74940.00x4eba0ec2464ec7c746e8872078165c8ad52d346f1727136000quickstart2024-09-24 02:00:00
99111.00x61065f131e2ec851c40765bb0b078a318a36f53e1727136000quickstart2024-09-24 02:00:00
261820.00x7e191324f0efb8aa20b8c702d95e812e55b4179c1727136000pearl2024-09-24 02:00:00
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "20792 1.0 0xcc9c26a86dd55aa04dcb0066c9b8fca2983f407d \n", "21130 1.0 0xd1bd18d7601d106639f922f1b5d2eda025c26be7 \n", "7494 0.0 0x4eba0ec2464ec7c746e8872078165c8ad52d346f \n", "9911 1.0 0x61065f131e2ec851c40765bb0b078a318a36f53e \n", "26182 0.0 0x7e191324f0efb8aa20b8c702d95e812e55b4179c \n", "\n", " openingTimestamp market_creator opening_datetime \n", "20792 1727136000 quickstart 2024-09-24 02:00:00 \n", "21130 1727136000 quickstart 2024-09-24 02:00:00 \n", "7494 1727136000 quickstart 2024-09-24 02:00:00 \n", "9911 1727136000 quickstart 2024-09-24 02:00:00 \n", "26182 1727136000 pearl 2024-09-24 02:00:00 " ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trade_markets.tail()" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "import math\n", "def market_KL_divergence(market_row: pd.DataFrame) -> float:\n", " \"\"\"Function to compute the divergence based on the formula\n", " Formula in https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence\"\"\"\n", " current_answer = market_row.currentAnswer # \"yes\", \"no\"\n", " target_prob = 1 # = 100%\n", " if current_answer == \"yes\":\n", " candidate_prob = market_row.first_outcome_prob\n", " else: # \"no\"\n", " candidate_prob = market_row.second_outcome_prob\n", "\n", " # we have only one sample, the final probability based on tokens\n", " kl_divergence = candidate_prob * round(math.log(candidate_prob / target_prob), 4)\n", " return kl_divergence" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "719" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(list(trade_markets.id.unique()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trade_markets.currentAnswer.value_counts()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "INVALID_ANSWER_HEX = (\n", " \"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff\"\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "def convert_hex_to_int(x):\n", " \"\"\"Convert hex to int\"\"\"\n", " if isinstance(x, float):\n", " return np.nan\n", " if isinstance(x, str):\n", " if x == INVALID_ANSWER_HEX:\n", " return -1\n", " answer = int(x, 16)\n", " return answer\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "market_ids = list(markets.id.unique())\n", "for i in range(len(trade_markets)):\n", " market = trade_markets.iloc[i]\n", " if market.id in market_ids:\n", " current_answer = convert_hex_to_int(market.currentAnswer)\n", " market_answer = markets.loc[markets[\"id\"]==market.id].currentAnswer.values[0]\n", " print(f\"current answer = {current_answer} and market answer {market_answer}\")\n", " trade_markets.at[i, \"currentAnswer\"] = market_answer" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "markets[\"currentAnswer\"] = markets[\"currentAnswer\"].apply(lambda x: convert_hex_to_int(x))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "currentAnswer\n", " 1.0 407\n", " 0.0 241\n", "-1.0 84\n", "Name: count, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets.currentAnswer.value_counts()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0769610411361284" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import math\n", "\n", "candidate_prob = 9/25\n", "target_prob = 1/3\n", "math.log(candidate_prob/target_prob)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 6.296890976997244\n" ] } ], "source": [ "import numpy as np\n", "\n", "def kl_divergence(p, q):\n", " \"\"\"\n", " Compute KL divergence for a single sample with two probabilities.\n", " \n", " :param p: First probability (true distribution)\n", " :param q: Second probability (approximating distribution)\n", " :return: KL divergence value\n", " \"\"\"\n", " # Ensure probabilities sum to 1\n", " p = np.array([p, 1-p])\n", " q = np.array([q, 1-q])\n", " \n", " # Avoid division by zero\n", " epsilon = 1e-10\n", " q = np.clip(q, epsilon, 1-epsilon)\n", " \n", " # Compute KL divergence\n", " kl_div = np.sum(p * np.log(p / q))\n", " \n", " return kl_div\n", "\n", "# Example usage\n", "p = 0.7 # probability from true distribution\n", "q = 1.0 # probability from approximating distribution\n", "\n", "result = kl_divergence(p, q)\n", "print(f\"KL divergence: {result}\")" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: inf\n" ] } ], "source": [ "from scipy.special import kl_div\n", "\n", "# For multiple probabilities\n", "p = np.array([0.3, 0.7])\n", "q = np.array([0.0, 1.0])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This library is not useful if we have extreme values" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetimefirst_outcome_probsecond_outcome_probkl_divergenceoff_by_perc
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [currentAnswer, id, openingTimestamp, market_creator, opening_datetime, first_outcome_prob, second_outcome_prob, kl_divergence, off_by_perc]\n", "Index: []" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets_div = pd.read_parquet(\"../data/closed_markets_div.parquet\")\n", "markets_div.head()" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetimefirst_outcome_probsecond_outcome_probkl_divergence
642yes0x4eba0ec2464ec7c746e8872078165c8ad52d346f1727136000quickstart2024-09-24 02:00:000.53920.46089.920241
643no0x3535b4cea3ea7b1862fbe1af5a458702cc1c0dad1727136000quickstart2024-09-24 02:00:000.28120.71885.880786
644yes0x7e191324f0efb8aa20b8c702d95e812e55b4179c1727136000pearl2024-09-24 02:00:000.50000.500010.819778
645no0xd1bd18d7601d106639f922f1b5d2eda025c26be71727136000quickstart2024-09-24 02:00:000.50000.500010.819778
646no0x61065f131e2ec851c40765bb0b078a318a36f53e1727136000quickstart2024-09-24 02:00:000.50000.500010.819778
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "642 yes 0x4eba0ec2464ec7c746e8872078165c8ad52d346f \n", "643 no 0x3535b4cea3ea7b1862fbe1af5a458702cc1c0dad \n", "644 yes 0x7e191324f0efb8aa20b8c702d95e812e55b4179c \n", "645 no 0xd1bd18d7601d106639f922f1b5d2eda025c26be7 \n", "646 no 0x61065f131e2ec851c40765bb0b078a318a36f53e \n", "\n", " openingTimestamp market_creator opening_datetime first_outcome_prob \\\n", "642 1727136000 quickstart 2024-09-24 02:00:00 0.5392 \n", "643 1727136000 quickstart 2024-09-24 02:00:00 0.2812 \n", "644 1727136000 pearl 2024-09-24 02:00:00 0.5000 \n", "645 1727136000 quickstart 2024-09-24 02:00:00 0.5000 \n", "646 1727136000 quickstart 2024-09-24 02:00:00 0.5000 \n", "\n", " second_outcome_prob kl_divergence \n", "642 0.4608 9.920241 \n", "643 0.7188 5.880786 \n", "644 0.5000 10.819778 \n", "645 0.5000 10.819778 \n", "646 0.5000 10.819778 " ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets_div.tail()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "647" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(markets_div)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import math" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def kl_divergence(p, q):\n", " \"\"\"\n", " Compute KL divergence for a single sample with two probabilities.\n", "\n", " :param p: First probability (true distribution)\n", " :param q: Second probability (approximating distribution)\n", " :return: KL divergence value\n", " \"\"\"\n", " # Ensure probabilities sum to 1\n", " p = np.array([p, 1 - p])\n", " q = np.array([q, 1 - q])\n", "\n", " # Avoid division by zero\n", " epsilon = 1e-10\n", " q = np.clip(q, epsilon, 1 - epsilon)\n", " print(q)\n", "\n", " # Compute KL divergence\n", " kl_div = np.sum(p * np.log(p / q))\n", "\n", " return kl_div" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-22.82067008, 1.6847004 ])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p= 0\n", "q = 0.8145\n", "p = np.array([p, 1 - p])\n", "q = np.array([q, 1 - q])\n", "epsilon = 1e-10\n", "p = np.clip(p, epsilon, 1 - epsilon)\n", "np.log(p/q)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-2.28206701e-09, 1.68470040e+00])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p*np.log(p/q)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.6847003943841101" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.sum(p * np.log(p / q))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.791663620863367" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0.8145\n", "q = 1.0\n", "kl_divergence(p,q)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.16397451204513597" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0.99\n", "q = 0.8145\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 0.16397451204513597\n" ] } ], "source": [ "from scipy.special import kl_div\n", "\n", "# For multiple probabilities\n", "p = np.array([0.99, 0.01])\n", "q = np.array([0.8145, 0.1855])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.2051808486854041" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p= 1\n", "q = 0.8145\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 0.20518085094003724\n" ] } ], "source": [ "# For multiple probabilities\n", "p = np.array([1.0, 0.0])\n", "q = np.array([0.8145, 0.1855])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1.e+00 1.e-10]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/gp/02mb1d514ng739czlxw1lhh00000gn/T/ipykernel_28964/3714966623.py:19: RuntimeWarning: divide by zero encountered in log\n", " kl_div = np.sum(p * np.log(p / q))\n", "/var/folders/gp/02mb1d514ng739czlxw1lhh00000gn/T/ipykernel_28964/3714966623.py:19: RuntimeWarning: invalid value encountered in multiply\n", " kl_div = np.sum(p * np.log(p / q))\n" ] }, { "data": { "text/plain": [ "nan" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0\n", "q = 1\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 0.010050335853501449\n" ] } ], "source": [ "p = np.array([0.0, 1.0])\n", "q = np.array([0.01, 0.99])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.17425697504355725" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0.01\n", "q = 0\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: inf\n" ] } ], "source": [ "p = np.array([0.01, 0.99])\n", "q = np.array([0.0, 1.0])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.01, 0.99])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "epsilon = 1e-10\n", "q = 0\n", "q = np.clip(p, epsilon, 1 - epsilon)\n", "q" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# New function" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def kl_divergence(P, Q):\n", " \"\"\"\n", " Compute KL divergence for a single sample with two prob distributions.\n", "\n", " :param P: True distribution)\n", " :param Q: Approximating distribution)\n", " :return: KL divergence value\n", " \"\"\"\n", " # Review edge cases\n", " if P[0] == Q[0]:\n", " return 0.0\n", " # If P is complete opposite of Q, divergence is some max value.\n", " # Here set to 20--allows for Q [\\mu, 1-\\mu] or Q[1-\\mu, \\mu] where \\mu = 10^-8\n", " if P[0] == Q[1]:\n", " return 20\n", "\n", " nonzero = P > 0.0\n", " # Compute KL divergence\n", " kl_div = np.sum(P[nonzero] * np.log(P[nonzero] / Q[nonzero]))\n", "\n", " return kl_div" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0\n" ] } ], "source": [ "P = np.array([0.0, 1.0])\n", "Q = np.array([0.0, 1.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20\n" ] } ], "source": [ "P = np.array([0.0, 1.0])\n", "Q = np.array([1.0, 0.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20\n" ] } ], "source": [ "P = np.array([1.0, 0.0])\n", "Q = np.array([0.0, 1.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.010050335853501506\n" ] } ], "source": [ "P = np.array([0.0, 1.0])\n", "Q = np.array([0.01, 0.99])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.010050335853501506" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "1 * np.log(1 / 0.99)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5108256237659907\n" ] } ], "source": [ "P = np.array([1.0, 0.0])\n", "Q = np.array([0.60, 0.05])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13.815510557964274\n" ] } ], "source": [ "Q = np.array([1e-6, 0.999999])#or [0.99, 0.01]\n", "P = np.array([1.0, 0.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4.605170185988092" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.log(1/0.01)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# Assuming you have your dataframe as 'df'\n", "# If not, you can create a sample dataframe like this:\n", "df = pd.DataFrame({\n", " 'week': [1, 1, 1, 2, 2, 2, 3, 3, 3],\n", " 'kl_divergence': [0.1, 0.2, 0.15, 0.3, 0.25, 0.35, 0.4, 0.45, 0.5],\n", " 'off_by_perc': [5, 10, 7, 15, 12, 18, 20, 22, 25]\n", "})\n", "\n", "# Create the main figure and axis\n", "fig, ax1 = plt.subplots(figsize=(10, 6))\n", "\n", "# Create the boxplot using seaborn\n", "sns.boxplot(x='week', y='kl_divergence', data=df, ax=ax1)\n", "\n", "# Set labels and title for the main axis\n", "ax1.set_xlabel('Week')\n", "ax1.set_ylabel('KL Divergence')\n", "ax1.set_title('KL Divergence Boxplot with Off-by Percentage')\n", "\n", "# Create a secondary y-axis\n", "ax2 = ax1.twinx()\n", "\n", "# Plot the off_by_perc values on the secondary y-axis\n", "for i, week in enumerate(df['week'].unique()):\n", " off_by_perc = df[df['week'] == week]['off_by_perc']\n", " ax2.scatter([i] * len(off_by_perc), off_by_perc, color='red', alpha=0.01)\n", "\n", "# Set label for the secondary y-axis\n", "ax2.set_ylabel('Off-by Percentage')\n", "\n", "# Adjust the layout and display the plot\n", "plt.tight_layout()\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "hf_dashboards", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }