{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "try:\n", " markets = pd.read_parquet(\"../data/fpmms.parquet\")\n", "except Exception:\n", " print(\"Error reading the parquet file\")\n", "\n", "markets[\"currentAnswer\"] = markets[\"currentAnswer\"].apply(lambda x: x.lower())\n", "# filter only markets with yes, no answers\n", "valid_answers = [\"yes\", \"no\"]\n", "markets = markets.loc[markets[\"currentAnswer\"].isin(valid_answers)]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4686" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(markets)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4686" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(markets.id.unique())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridtitlemarket_creator
0no0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5Will the first floating offshore wind research...quickstart
1no0x0020d13c89140b47e10db54cbd53852b90bc1391Will the Francis Scott Key Bridge in Baltimore...quickstart
2no0x003ae5e007cc38b3f86b0ed7c82f938a1285ac07Will FC Saarbrucken reach the final of the Ger...quickstart
3yes0x004c8d4c619dc6b9caa940f5ea7ef699ae85359cWill the pro-life activists convicted for 'con...quickstart
4yes0x005e3f7a90585acbec807425a750fbba1d0c2b5cWill Apple announce the release of a new M4 ch...quickstart
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "0 no 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 \n", "1 no 0x0020d13c89140b47e10db54cbd53852b90bc1391 \n", "2 no 0x003ae5e007cc38b3f86b0ed7c82f938a1285ac07 \n", "3 yes 0x004c8d4c619dc6b9caa940f5ea7ef699ae85359c \n", "4 yes 0x005e3f7a90585acbec807425a750fbba1d0c2b5c \n", "\n", " title market_creator \n", "0 Will the first floating offshore wind research... quickstart \n", "1 Will the Francis Scott Key Bridge in Baltimore... quickstart \n", "2 Will FC Saarbrucken reach the final of the Ger... quickstart \n", "3 Will the pro-life activists convicted for 'con... quickstart \n", "4 Will Apple announce the release of a new M4 ch... quickstart " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "trades = pd.read_parquet(\"../data/fpmmTrades.parquet\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
collateralAmountcollateralAmountUSDcollateralTokencreationTimestamptrader_addressfeeAmountidoldOutcomeTokenMarginalPriceoutcomeIndexoutcomeTokenMarginalPrice...market_creatorfpmm.answerFinalizedTimestampfpmm.arbitrationOccurredfpmm.currentAnswerfpmm.idfpmm.isPendingArbitrationfpmm.openingTimestampfpmm.outcomesfpmm.titlefpmm.condition.id
04504264746507386880.45042696940341457163080730941680060xe91d153e0b41518a2ce8dd3d7944fa863463a97d17245534550x022b36c50b85b8ae7addfb8a35d76c59d581483490085294930147730x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x02...0.59278521060961027063412533557212910.6171295391012242250994586583534301...quickstart1725071760False0x00000000000000000000000000000000000000000000...0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5False1724976000[Yes, No]Will the first floating offshore wind research...0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1...
16101632145469414000.61016362322151501356540073370152980xe91d153e0b41518a2ce8dd3d7944fa863463a97d17248119400x034c4ad84f7ac6638bf19300d5bbe7d9b981e736122032642909388280x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x03...0.84299263652375506193482212939481210.8523396372892128845826889719620915...quickstart1725071760False0x00000000000000000000000000000000000000000000...0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5False1724976000[Yes, No]Will the first floating offshore wind research...0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1...
27890650923324606720.78906441205273240719087938227960860xe91d153e0b41518a2ce8dd3d7944fa863463a97d17248157550x09e9d42a029e8b0c2df3871709a762117a681d92157813018466492130x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x09...0.798377574371244289110459877033902810.8152123711444691659642000374025623...quickstart1725071760False0x00000000000000000000000000000000000000000000...0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5False1724976000[Yes, No]Will the first floating offshore wind research...0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1...
310000000000000000001.0000006053836603290484917949391260xe91d153e0b41518a2ce8dd3d7944fa863463a97d17245466200x09e9d42a029e8b0c2df3871709a762117a681d92200000000000000000x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x09...0.511074590773343880544707225262270810.5746805204222762335911904727318937...quickstart1725071760False0x00000000000000000000000000000000000000000000...0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5False1724976000[Yes, No]Will the first floating offshore wind research...0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1...
41000000000000000000.10000042712628624195473946465679060xe91d153e0b41518a2ce8dd3d7944fa863463a97d17247712600x0d049dcaece0ecb6fc81a460da7bcc2a4785d6e520000000000000000x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x0d...0.271396821866231938898868198738940800.2804586217805511523845593360379658...quickstart1725071760False0x00000000000000000000000000000000000000000000...0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5False1724976000[Yes, No]Will the first floating offshore wind research...0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1...
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " collateralAmount collateralAmountUSD \\\n", "0 450426474650738688 0.4504269694034145716308073094168006 \n", "1 610163214546941400 0.6101636232215150135654007337015298 \n", "2 789065092332460672 0.7890644120527324071908793822796086 \n", "3 1000000000000000000 1.000000605383660329048491794939126 \n", "4 100000000000000000 0.1000004271262862419547394646567906 \n", "\n", " collateralToken creationTimestamp \\\n", "0 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1724553455 \n", "1 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1724811940 \n", "2 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1724815755 \n", "3 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1724546620 \n", "4 0xe91d153e0b41518a2ce8dd3d7944fa863463a97d 1724771260 \n", "\n", " trader_address feeAmount \\\n", "0 0x022b36c50b85b8ae7addfb8a35d76c59d5814834 9008529493014773 \n", "1 0x034c4ad84f7ac6638bf19300d5bbe7d9b981e736 12203264290938828 \n", "2 0x09e9d42a029e8b0c2df3871709a762117a681d92 15781301846649213 \n", "3 0x09e9d42a029e8b0c2df3871709a762117a681d92 20000000000000000 \n", "4 0x0d049dcaece0ecb6fc81a460da7bcc2a4785d6e5 2000000000000000 \n", "\n", " id \\\n", "0 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x02... \n", "1 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x03... \n", "2 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x09... \n", "3 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x09... \n", "4 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f50x0d... \n", "\n", " oldOutcomeTokenMarginalPrice outcomeIndex \\\n", "0 0.592785210609610270634125335572129 1 \n", "1 0.842992636523755061934822129394812 1 \n", "2 0.7983775743712442891104598770339028 1 \n", "3 0.5110745907733438805447072252622708 1 \n", "4 0.2713968218662319388988681987389408 0 \n", "\n", " outcomeTokenMarginalPrice ... market_creator \\\n", "0 0.6171295391012242250994586583534301 ... quickstart \n", "1 0.8523396372892128845826889719620915 ... quickstart \n", "2 0.8152123711444691659642000374025623 ... 
quickstart \n", "3 0.5746805204222762335911904727318937 ... quickstart \n", "4 0.2804586217805511523845593360379658 ... quickstart \n", "\n", " fpmm.answerFinalizedTimestamp fpmm.arbitrationOccurred \\\n", "0 1725071760 False \n", "1 1725071760 False \n", "2 1725071760 False \n", "3 1725071760 False \n", "4 1725071760 False \n", "\n", " fpmm.currentAnswer \\\n", "0 0x00000000000000000000000000000000000000000000... \n", "1 0x00000000000000000000000000000000000000000000... \n", "2 0x00000000000000000000000000000000000000000000... \n", "3 0x00000000000000000000000000000000000000000000... \n", "4 0x00000000000000000000000000000000000000000000... \n", "\n", " fpmm.id fpmm.isPendingArbitration \\\n", "0 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 False \n", "1 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 False \n", "2 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 False \n", "3 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 False \n", "4 0x0017cd58d6a7ee1451388c7d5b1051b4c0a041f5 False \n", "\n", " fpmm.openingTimestamp fpmm.outcomes \\\n", "0 1724976000 [Yes, No] \n", "1 1724976000 [Yes, No] \n", "2 1724976000 [Yes, No] \n", "3 1724976000 [Yes, No] \n", "4 1724976000 [Yes, No] \n", "\n", " fpmm.title \\\n", "0 Will the first floating offshore wind research... \n", "1 Will the first floating offshore wind research... \n", "2 Will the first floating offshore wind research... \n", "3 Will the first floating offshore wind research... \n", "4 Will the first floating offshore wind research... \n", "\n", " fpmm.condition.id \n", "0 0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1... \n", "1 0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1... \n", "2 0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1... \n", "3 0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1... \n", "4 0x0e940f12f30e928e4879c52d065d9da739a3d3f020d1... 
\n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trades.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 26835 entries, 0 to 26834\n", "Data columns (total 24 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 collateralAmount 26835 non-null object\n", " 1 collateralAmountUSD 26835 non-null object\n", " 2 collateralToken 26835 non-null object\n", " 3 creationTimestamp 26835 non-null object\n", " 4 trader_address 26835 non-null object\n", " 5 feeAmount 26835 non-null object\n", " 6 id 26835 non-null object\n", " 7 oldOutcomeTokenMarginalPrice 26835 non-null object\n", " 8 outcomeIndex 26835 non-null object\n", " 9 outcomeTokenMarginalPrice 26835 non-null object\n", " 10 outcomeTokensTraded 26835 non-null object\n", " 11 title 26835 non-null object\n", " 12 transactionHash 26835 non-null object\n", " 13 type 26835 non-null object\n", " 14 market_creator 26835 non-null object\n", " 15 fpmm.answerFinalizedTimestamp 24829 non-null object\n", " 16 fpmm.arbitrationOccurred 26835 non-null bool \n", " 17 fpmm.currentAnswer 24829 non-null object\n", " 18 fpmm.id 26835 non-null object\n", " 19 fpmm.isPendingArbitration 26835 non-null bool \n", " 20 fpmm.openingTimestamp 26835 non-null object\n", " 21 fpmm.outcomes 26835 non-null object\n", " 22 fpmm.title 26835 non-null object\n", " 23 fpmm.condition.id 26835 non-null object\n", "dtypes: bool(2), object(22)\n", "memory usage: 4.6+ MB\n" ] } ], "source": [ "trades.info()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['collateralAmount', 'collateralAmountUSD', 'collateralToken',\n", " 'creationTimestamp', 'trader_address', 'feeAmount', 'id',\n", " 'oldOutcomeTokenMarginalPrice', 'outcomeIndex',\n", " 'outcomeTokenMarginalPrice', 
from datetime import datetime

INVALID_ANSWER_HEX = (
    "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
)
columns_of_interest = [
    "fpmm.currentAnswer",
    "fpmm.id",
    "fpmm.openingTimestamp",
    "market_creator",
]

# Build one row per market from the trades table.
# Every step returns a new frame (no `inplace=True`, no assignment into a
# slice of `trades`), which removes the SettingWithCopyWarning the in-place
# rename used to trigger and makes the cell safe to re-run.
# NOTE(review): relies on `trades` and `convert_hex_to_int` already being
# defined in the kernel when this cell runs.
trade_markets = (
    trades[columns_of_interest]
    .rename(
        columns={
            "fpmm.currentAnswer": "currentAnswer",
            "fpmm.openingTimestamp": "openingTimestamp",
            "fpmm.id": "id",
        }
    )
    # keep the last trade row seen for each market id
    .drop_duplicates(subset=["id"], keep="last")
    # remove invalid answers (rows with a missing answer pass this filter)
    .loc[lambda df: df["currentAnswer"] != INVALID_ANSWER_HEX]
    .assign(
        currentAnswer=lambda df: df["currentAnswer"].apply(convert_hex_to_int),
        # fromtimestamp() without tz converts to the machine's local timezone
        opening_datetime=lambda df: df["openingTimestamp"].apply(
            lambda ts: datetime.fromtimestamp(int(ts))
        ),
    )
    .sort_values(by="opening_datetime", ascending=True)
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetime
15736NaN0x92ed80e541f642b564f992245abe640282dd273c1727568000quickstart2024-09-29 02:00:00
6272NaN0x4002481fe7bc39c1baa4b5988c038da13ed058321727568000quickstart2024-09-29 02:00:00
24383NaN0xf820d06509027c309b00cd386055982d9bea0c101727568000quickstart2024-09-29 02:00:00
12418NaN0x74e0fa941341ebe980fbdcfa8b40244cb448eb561727568000quickstart2024-09-29 02:00:00
4754NaN0x2f44e179b5cc964e504046bac31d6945a0652af21727568000quickstart2024-09-29 02:00:00
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "15736 NaN 0x92ed80e541f642b564f992245abe640282dd273c \n", "6272 NaN 0x4002481fe7bc39c1baa4b5988c038da13ed05832 \n", "24383 NaN 0xf820d06509027c309b00cd386055982d9bea0c10 \n", "12418 NaN 0x74e0fa941341ebe980fbdcfa8b40244cb448eb56 \n", "4754 NaN 0x2f44e179b5cc964e504046bac31d6945a0652af2 \n", "\n", " openingTimestamp market_creator opening_datetime \n", "15736 1727568000 quickstart 2024-09-29 02:00:00 \n", "6272 1727568000 quickstart 2024-09-29 02:00:00 \n", "24383 1727568000 quickstart 2024-09-29 02:00:00 \n", "12418 1727568000 quickstart 2024-09-29 02:00:00 \n", "4754 1727568000 quickstart 2024-09-29 02:00:00 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trade_markets.tail()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "719" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "trade_markets.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "648" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetime
207921.00xcc9c26a86dd55aa04dcb0066c9b8fca2983f407d1727136000quickstart2024-09-24 02:00:00
211301.00xd1bd18d7601d106639f922f1b5d2eda025c26be71727136000quickstart2024-09-24 02:00:00
74940.00x4eba0ec2464ec7c746e8872078165c8ad52d346f1727136000quickstart2024-09-24 02:00:00
99111.00x61065f131e2ec851c40765bb0b078a318a36f53e1727136000quickstart2024-09-24 02:00:00
261820.00x7e191324f0efb8aa20b8c702d95e812e55b4179c1727136000pearl2024-09-24 02:00:00
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "20792 1.0 0xcc9c26a86dd55aa04dcb0066c9b8fca2983f407d \n", "21130 1.0 0xd1bd18d7601d106639f922f1b5d2eda025c26be7 \n", "7494 0.0 0x4eba0ec2464ec7c746e8872078165c8ad52d346f \n", "9911 1.0 0x61065f131e2ec851c40765bb0b078a318a36f53e \n", "26182 0.0 0x7e191324f0efb8aa20b8c702d95e812e55b4179c \n", "\n", " openingTimestamp market_creator opening_datetime \n", "20792 1727136000 quickstart 2024-09-24 02:00:00 \n", "21130 1727136000 quickstart 2024-09-24 02:00:00 \n", "7494 1727136000 quickstart 2024-09-24 02:00:00 \n", "9911 1727136000 quickstart 2024-09-24 02:00:00 \n", "26182 1727136000 pearl 2024-09-24 02:00:00 " ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trade_markets.tail()" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "import math\n", "def market_KL_divergence(market_row: pd.DataFrame) -> float:\n", " \"\"\"Function to compute the divergence based on the formula\n", " Formula in https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence\"\"\"\n", " current_answer = market_row.currentAnswer # \"yes\", \"no\"\n", " target_prob = 1 # = 100%\n", " if current_answer == \"yes\":\n", " candidate_prob = market_row.first_outcome_prob\n", " else: # \"no\"\n", " candidate_prob = market_row.second_outcome_prob\n", "\n", " # we have only one sample, the final probability based on tokens\n", " kl_divergence = candidate_prob * round(math.log(candidate_prob / target_prob), 4)\n", " return kl_divergence" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "719" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(trade_markets)" ] }, { "cell_type": 
"code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "719" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(list(trade_markets.id.unique()))" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "currentAnswer\n", "0x0000000000000000000000000000000000000000000000000000000000000001 407\n", "0x0000000000000000000000000000000000000000000000000000000000000000 241\n", "Name: count, dtype: int64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trade_markets.currentAnswer.value_counts()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "INVALID_ANSWER_HEX = (\n", " \"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff\"\n", ")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "def convert_hex_to_int(x):\n", " \"\"\"Convert hex to int\"\"\"\n", " if isinstance(x, float):\n", " return np.nan\n", " if isinstance(x, str):\n", " if x == INVALID_ANSWER_HEX:\n", " return -1\n", " answer = int(x, 16)\n", " return answer\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "market_ids = list(markets.id.unique())\n", "for i in range(len(trade_markets)):\n", " market = trade_markets.iloc[i]\n", " if market.id in market_ids:\n", " current_answer = convert_hex_to_int(market.currentAnswer)\n", " market_answer = markets.loc[markets[\"id\"]==market.id].currentAnswer.values[0]\n", " print(f\"current answer = {current_answer} and market answer {market_answer}\")\n", " trade_markets.at[i, \"currentAnswer\"] = market_answer" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "markets[\"currentAnswer\"] = markets[\"currentAnswer\"].apply(lambda x: convert_hex_to_int(x))" ] }, { "cell_type": 
"code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "currentAnswer\n", " 1.0 407\n", " 0.0 241\n", "-1.0 84\n", "Name: count, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets.currentAnswer.value_counts()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0769610411361284" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import math\n", "\n", "candidate_prob = 9/25\n", "target_prob = 1/3\n", "math.log(candidate_prob/target_prob)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 6.296890976997244\n" ] } ], "source": [ "import numpy as np\n", "\n", "def kl_divergence(p, q):\n", " \"\"\"\n", " Compute KL divergence for a single sample with two probabilities.\n", " \n", " :param p: First probability (true distribution)\n", " :param q: Second probability (approximating distribution)\n", " :return: KL divergence value\n", " \"\"\"\n", " # Ensure probabilities sum to 1\n", " p = np.array([p, 1-p])\n", " q = np.array([q, 1-q])\n", " \n", " # Avoid division by zero\n", " epsilon = 1e-10\n", " q = np.clip(q, epsilon, 1-epsilon)\n", " \n", " # Compute KL divergence\n", " kl_div = np.sum(p * np.log(p / q))\n", " \n", " return kl_div\n", "\n", "# Example usage\n", "p = 0.7 # probability from true distribution\n", "q = 1.0 # probability from approximating distribution\n", "\n", "result = kl_divergence(p, q)\n", "print(f\"KL divergence: {result}\")" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: inf\n" ] } ], "source": [ "from scipy.special import kl_div\n", "\n", "# For multiple probabilities\n", "p = np.array([0.3, 0.7])\n", "q = np.array([0.0, 1.0])\n", "\n", "kl = 
np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This library is not useful if we have extreme values" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetimefirst_outcome_probsecond_outcome_probkl_divergence
0yes0x67490193504b49a247d6a3ba7d441e9894d9615f1722470400quickstart2024-08-01 02:00:000.81450.18553.791664
1no0x17f2c97bf52a79671878201bf2995a3b6daba0411722470400quickstart2024-08-01 02:00:000.19750.80254.050688
2no0xbca6aa704a02a5c5a766ff829dacc81aee5547cf1722470400quickstart2024-08-01 02:00:000.69690.303115.433247
3no0x221c71bab604691b0b8805c1c433fc8e22123a671722470400pearl2024-08-01 02:00:000.47570.524310.261432
4no0xe4d078b9be12319c0063f58dc10f19604a5df1631722470400quickstart2024-08-01 02:00:000.34730.65277.351119
\n", "
" ], "text/plain": [ " currentAnswer id openingTimestamp \\\n", "0 yes 0x67490193504b49a247d6a3ba7d441e9894d9615f 1722470400 \n", "1 no 0x17f2c97bf52a79671878201bf2995a3b6daba041 1722470400 \n", "2 no 0xbca6aa704a02a5c5a766ff829dacc81aee5547cf 1722470400 \n", "3 no 0x221c71bab604691b0b8805c1c433fc8e22123a67 1722470400 \n", "4 no 0xe4d078b9be12319c0063f58dc10f19604a5df163 1722470400 \n", "\n", " market_creator opening_datetime first_outcome_prob second_outcome_prob \\\n", "0 quickstart 2024-08-01 02:00:00 0.8145 0.1855 \n", "1 quickstart 2024-08-01 02:00:00 0.1975 0.8025 \n", "2 quickstart 2024-08-01 02:00:00 0.6969 0.3031 \n", "3 pearl 2024-08-01 02:00:00 0.4757 0.5243 \n", "4 quickstart 2024-08-01 02:00:00 0.3473 0.6527 \n", "\n", " kl_divergence \n", "0 3.791664 \n", "1 4.050688 \n", "2 15.433247 \n", "3 10.261432 \n", "4 7.351119 " ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets_div = pd.read_parquet(\"../data/closed_markets_div.parquet\")\n", "markets_div.head()" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
currentAnsweridopeningTimestampmarket_creatoropening_datetimefirst_outcome_probsecond_outcome_probkl_divergence
642yes0x4eba0ec2464ec7c746e8872078165c8ad52d346f1727136000quickstart2024-09-24 02:00:000.53920.46089.920241
643no0x3535b4cea3ea7b1862fbe1af5a458702cc1c0dad1727136000quickstart2024-09-24 02:00:000.28120.71885.880786
644yes0x7e191324f0efb8aa20b8c702d95e812e55b4179c1727136000pearl2024-09-24 02:00:000.50000.500010.819778
645no0xd1bd18d7601d106639f922f1b5d2eda025c26be71727136000quickstart2024-09-24 02:00:000.50000.500010.819778
646no0x61065f131e2ec851c40765bb0b078a318a36f53e1727136000quickstart2024-09-24 02:00:000.50000.500010.819778
\n", "
" ], "text/plain": [ " currentAnswer id \\\n", "642 yes 0x4eba0ec2464ec7c746e8872078165c8ad52d346f \n", "643 no 0x3535b4cea3ea7b1862fbe1af5a458702cc1c0dad \n", "644 yes 0x7e191324f0efb8aa20b8c702d95e812e55b4179c \n", "645 no 0xd1bd18d7601d106639f922f1b5d2eda025c26be7 \n", "646 no 0x61065f131e2ec851c40765bb0b078a318a36f53e \n", "\n", " openingTimestamp market_creator opening_datetime first_outcome_prob \\\n", "642 1727136000 quickstart 2024-09-24 02:00:00 0.5392 \n", "643 1727136000 quickstart 2024-09-24 02:00:00 0.2812 \n", "644 1727136000 pearl 2024-09-24 02:00:00 0.5000 \n", "645 1727136000 quickstart 2024-09-24 02:00:00 0.5000 \n", "646 1727136000 quickstart 2024-09-24 02:00:00 0.5000 \n", "\n", " second_outcome_prob kl_divergence \n", "642 0.4608 9.920241 \n", "643 0.7188 5.880786 \n", "644 0.5000 10.819778 \n", "645 0.5000 10.819778 \n", "646 0.5000 10.819778 " ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "markets_div.tail()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "647" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(markets_div)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import math" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def kl_divergence(p, q):\n", " \"\"\"\n", " Compute KL divergence for a single sample with two probabilities.\n", "\n", " :param p: First probability (true distribution)\n", " :param q: Second probability (approximating distribution)\n", " :return: KL divergence value\n", " \"\"\"\n", " # Ensure probabilities sum to 1\n", " p = np.array([p, 1 - p])\n", " q = np.array([q, 1 - q])\n", "\n", " # Avoid division by zero\n", " epsilon = 1e-10\n", " q = np.clip(q, epsilon, 1 - epsilon)\n", " print(q)\n", "\n", " # Compute KL divergence\n", " kl_div = 
np.sum(p * np.log(p / q))\n", "\n", " return kl_div" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-22.82067008, 1.6847004 ])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p= 0\n", "q = 0.8145\n", "p = np.array([p, 1 - p])\n", "q = np.array([q, 1 - q])\n", "epsilon = 1e-10\n", "p = np.clip(p, epsilon, 1 - epsilon)\n", "np.log(p/q)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-2.28206701e-09, 1.68470040e+00])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p*np.log(p/q)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.6847003943841101" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.sum(p * np.log(p / q))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.791663620863367" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0.8145\n", "q = 1.0\n", "kl_divergence(p,q)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.16397451204513597" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0.99\n", "q = 0.8145\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 0.16397451204513597\n" ] } ], "source": [ "from scipy.special import kl_div\n", "\n", "# For multiple probabilities\n", "p = np.array([0.99, 0.01])\n", "q = np.array([0.8145, 0.1855])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "0.2051808486854041" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p= 1\n", "q = 0.8145\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 0.20518085094003724\n" ] } ], "source": [ "# For multiple probabilities\n", "p = np.array([1.0, 0.0])\n", "q = np.array([0.8145, 0.1855])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1.e+00 1.e-10]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/gp/02mb1d514ng739czlxw1lhh00000gn/T/ipykernel_28964/3714966623.py:19: RuntimeWarning: divide by zero encountered in log\n", " kl_div = np.sum(p * np.log(p / q))\n", "/var/folders/gp/02mb1d514ng739czlxw1lhh00000gn/T/ipykernel_28964/3714966623.py:19: RuntimeWarning: invalid value encountered in multiply\n", " kl_div = np.sum(p * np.log(p / q))\n" ] }, { "data": { "text/plain": [ "nan" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0\n", "q = 1\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KL divergence: 0.010050335853501449\n" ] } ], "source": [ "p = np.array([0.0, 1.0])\n", "q = np.array([0.01, 0.99])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.17425697504355725" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = 0.01\n", "q = 0\n", "kl_divergence(p, q)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "KL divergence: inf\n" ] } ], "source": [ "p = np.array([0.01, 0.99])\n", "q = np.array([0.0, 1.0])\n", "\n", "kl = np.sum(kl_div(p, q))\n", "print(f\"KL divergence: {kl}\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.01, 0.99])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "epsilon = 1e-10\n", "q = 0\n", "q = np.clip(p, epsilon, 1 - epsilon)\n", "q" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# New function" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "def kl_divergence(P, Q):\n", " \"\"\"\n", " Compute KL divergence for a single sample with two prob distributions.\n", "\n", " :param P: True distribution)\n", " :param Q: Approximating distribution)\n", " :return: KL divergence value\n", " \"\"\"\n", " # Review edge cases\n", " if P[0] == Q[0]:\n", " return 0.0\n", " # If P is complete opposite of Q, divergence is some max value.\n", " # Here set to 20--allows for Q [\\mu, 1-\\mu] or Q[1-\\mu, \\mu] where \\mu = 10^-8\n", " if P[0] == Q[1]:\n", " return 20\n", "\n", " nonzero = P > 0.0\n", " # Compute KL divergence\n", " kl_div = np.sum(P[nonzero] * np.log(P[nonzero] / Q[nonzero]))\n", "\n", " return kl_div" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0\n" ] } ], "source": [ "P = np.array([0.0, 1.0])\n", "Q = np.array([0.0, 1.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20\n" ] } ], "source": [ "P = np.array([0.0, 1.0])\n", "Q = np.array([1.0, 0.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"20\n" ] } ], "source": [ "P = np.array([1.0, 0.0])\n", "Q = np.array([0.0, 1.0])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.010050335853501506\n" ] } ], "source": [ "P = np.array([0.0, 1.0])\n", "Q = np.array([0.01, 0.99])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.010050335853501506" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "1 * np.log(1 / 0.99)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.605170185988092\n" ] } ], "source": [ "P = np.array([1.0, 0.0])\n", "Q = np.array([0.01, 0.99])\n", "print(kl_divergence(P,Q))" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4.605170185988092" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.log(1/0.01)" ] } ], "metadata": { "kernelspec": { "display_name": "hf_dashboards", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }