Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
faad60e
update
Egor-Krivov Sep 27, 2023
1943eed
Working version
Egor-Krivov Oct 2, 2023
fad2249
Removed FRAGMENT SIZE hardcode
Egor-Krivov Oct 2, 2023
826c932
join update
Egor-Krivov Oct 2, 2023
6ed39e2
current solution
Egor-Krivov Oct 10, 2023
c36ee55
setup script
Egor-Krivov Oct 12, 2023
be7e46c
Merge branch 'duckdblabs:master' into modin
Egor-Krivov Oct 12, 2023
da91c48
fixed
Egor-Krivov Oct 12, 2023
9875fb8
Merge branch 'modin' of https://github.com/Egor-Krivov/db-benchmark i…
Egor-Krivov Oct 12, 2023
7405a95
Updates
Egor-Krivov Oct 12, 2023
01bdc14
moved HDK
Egor-Krivov Oct 13, 2023
379c099
Fixed formatting
Egor-Krivov Oct 13, 2023
52d22ba
Updated groupby
Egor-Krivov Oct 16, 2023
a1bbb3b
better name
Egor-Krivov Oct 27, 2023
e58ab0f
Merge branch 'master' into backend_modin
Egor-Krivov Oct 27, 2023
1408a2c
Merge remote-tracking branch 'origin/master' into backend_modin
Egor-Krivov Nov 7, 2023
d51206d
Update to latest modin
Egor-Krivov Nov 7, 2023
8768466
Update to latest HDK
Egor-Krivov Nov 8, 2023
2242d83
Fixed style
Egor-Krivov Nov 8, 2023
e5a1e0c
codestyle fix
Egor-Krivov Nov 8, 2023
59fb119
fixed interface
Egor-Krivov Nov 8, 2023
2e8388c
fixed solver
Egor-Krivov Nov 8, 2023
4f17138
added modin
Egor-Krivov Nov 8, 2023
e7aad15
cleaned up gitignore
Egor-Krivov Nov 9, 2023
655bba3
removed switch
Egor-Krivov Nov 13, 2023
aba390f
added regression test
Egor-Krivov Nov 13, 2023
4205d23
Merge remote-tracking branch 'origin/master' into backend_modin
Egor-Krivov Nov 24, 2023
f22b577
Fixed CPU count & miniconda activation
Egor-Krivov Nov 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/regression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
solution: [data.table, collapse, dplyr, pandas, modin, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
name: Regression Tests solo solutions
runs-on: ubuntu-20.04
env:
Expand Down Expand Up @@ -91,7 +91,7 @@ jobs:
name: ${{ matrix.solution }}-out.zip
path: ${{ matrix.solution }}-out.zip
if-no-files-found: error

regression-test-benchmark-runner-all-solutions:
needs: regression-test-benchmark-runner-solo-solutions
name: Regression Tests all solutions
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ metastore_db/*
*.md5
.Rproj.user
.Rhistory
miniconda
db-benchmark.Rproj
*/REVISION
token
Expand Down
27 changes: 27 additions & 0 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ solution.dict = {list(
"data.table" = list(name=c(short="data.table", long="data.table"), color=c(strong="blue", light="#7777FF")),
"dplyr" = list(name=c(short="dplyr", long="dplyr"), color=c(strong="red", light="#FF7777")),
"pandas" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")),
"modin" = list(name=c(short="modin", long="modin"), color=c(strong="blue4", light="#7799ff")),
"pydatatable" = list(name=c(short="pydatatable", long="(py)datatable"), color=c(strong="darkorange", light="orange")),
"spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")),
"dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")),
Expand Down Expand Up @@ -115,6 +116,18 @@ groupby.syntax.dict = {list(
"regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: x['v1'].corr(x['v2'])**2).rename(columns={None: 'r2'})",
"sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})"
)},
# modin: one syntax string per groupby question. `DF` is the placeholder for
# the input frame in every entry, including the SQL-based regression query.
"modin" = {c(
"sum v1 by id1" = "DF.groupby('id1', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})",
"sum v1 by id1:id2" = "DF.groupby(['id1','id2'], as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})",
"sum v1 mean v3 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v3':'mean'})",
"mean v1:v3 by id4" = "DF.groupby('id4', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})",
"sum v1:v3 by id6" = "DF.groupby('id6', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})",
"median v3 sd v3 by id4 id5" = "DF.groupby(['id4','id5'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3': ['median','std']})",
"max v1 - min v2 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['id3','range_v1_v2']]",
"largest two v3 by id6" = "DF.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]",
# NOTE(review): `query` is presumably the SQL entry point of the modin backend used by the solution script -- confirm.
# The frame argument is written as DF (not the script-local name `x`) to match the other entries.
"regression v1 v2 by id2 id4" = "query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=DF)",
"sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})"
)},
"pydatatable" = {c(
"sum v1 by id1" = "DT[:, {'v1': sum(f.v1)}, by(f.id1)]",
"sum v1 by id1:id2" = "DT[:, {'v1': sum(f.v1)}, by(f.id1, f.id2)]",
Expand Down Expand Up @@ -253,6 +266,7 @@ groupby.syntax.dict = {list(
"data.table" = list(),
"dplyr" = list(),
"pandas" = list(),
"modin" = list(),
"pydatatable" = list(),
"spark" = list("not yet implemented: SPARK-26589" = "median v3 sd v3 by id4 id5"),
"dask" = list("not yet implemented: dask#4362" = "median v3 sd v3 by id4 id5"),
Expand Down Expand Up @@ -281,6 +295,8 @@ groupby.data.exceptions = {list(
"pandas" = {list(
"out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9
)},
"modin" = {list(
)},
"pydatatable" = {list(
"csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0")
)},
Expand Down Expand Up @@ -385,6 +401,13 @@ join.syntax.dict = {list(
"medium inner on factor" = "DF.merge(medium, on='id5')",
"big inner on int" = "DF.merge(big, on='id3')"
)},
"modin" = {c(
"small inner on int" = "DF.merge(small, on='id1')",
"medium inner on int" = "DF.merge(medium, on='id2')",
"medium outer on int" = "DF.merge(medium, how='left', on='id2')",
"medium inner on factor" = "DF.merge(medium, on='id5')",
"big inner on int" = "DF.merge(big, on='id3')"
)},
"pydatatable" = {c(
"small inner on int" = "y.key = 'id1'; DT[:, :, join(y)][isfinite(f.v2), :]",
"medium inner on int" = "y.key = 'id2'; DT[:, :, join(y)][isfinite(f.v2), :]",
Expand Down Expand Up @@ -447,6 +470,7 @@ join.query.exceptions = {list(
"data.table" = list(),
"dplyr" = list(),
"pandas" = list(),
"modin" = list(),
"pydatatable" = list(),
"spark" = list(),
"dask" = list(),
Expand All @@ -471,6 +495,9 @@ join.data.exceptions = {list(
"pandas" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # read_csv
)},
"modin" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
"pydatatable" = {list(
"csv reader NAs bug: datatable#2808" = "J1_1e9_NA_5_0",
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_0_1") # q5 out of memory due to a deep copy
Expand Down
2 changes: 2 additions & 0 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ dplyr,groupby2014
pandas,groupby
pandas,join
pandas,groupby2014
modin,groupby
modin,join
pydatatable,groupby
pydatatable,join
spark,groupby
Expand Down
28 changes: 24 additions & 4 deletions _utils/repro.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,27 @@ cd pydatatable
virtualenv py-pydatatable --python=/usr/bin/python3.10
cd ../pandas
virtualenv py-pandas --python=/usr/bin/python3.10
#################
# Install modin #
#################
cd ../modin
virtualenv py-modin --python=/usr/bin/python3.10
curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \
sh install_miniconda.sh -u -b -p ./py-modin && \
rm -f install_miniconda.sh

# Bring conda into this shell. The earlier `cd ../modin` already placed us
# inside the modin directory, so the miniconda prefix is ./py-modin (the same
# relative path the installer was given and that the echo below appends to);
# ./modin/py-modin would not resolve from here. No `eval` is needed for `source`.
source ./py-modin/bin/activate
conda install -y conda-libmamba-solver

# Create a dedicated "modin" environment and append its activation to the
# activate script, so a later `source .../py-modin/bin/activate` also runs
# `conda activate modin`.
conda create --name modin -y
conda activate modin
echo "conda activate modin" >> ./py-modin/bin/activate

# install binaries
conda install -y -c conda-forge modin-hdk --solver=libmamba

conda deactivate
conda deactivate

cd ..


Expand All @@ -45,8 +64,9 @@ python3 -m pip install --upgrade pandas
deactivate

source ./modin/py-modin/bin/activate
python3 -m pip install --upgrade modin
deactivate
conda update modin-hdk -y -c conda-forge --solver=libmamba
conda deactivate
conda deactivate

source ./pydatatable/py-pydatatable/bin/activate
python3 -m pip install --upgrade git+https://github.com/h2oai/datatable
Expand All @@ -72,7 +92,7 @@ mv G1_1e7_1e2_0_0.csv data/
echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB"
cp run.conf run.conf.original
sed -i 's/groupby join groupby2014/groupby/g' run.conf
sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf
sed -i 's/data.table dplyr pandas modin pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf
sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf

# set sizes
Expand Down
Loading