Skip to content

Commit

Permalink
Merge pull request #210 from pmcgleenon/datafusion-40.0
Browse files Browse the repository at this point in the history
Updated for datafusion version 40.0.0
  • Loading branch information
rschu1ze authored Aug 2, 2024
2 parents 92d0fa5 + 3fdc64d commit bf07cf7
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 98 deletions.
8 changes: 4 additions & 4 deletions datafusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe
1. `cd ClickBench/datafusion`
1. `vi benchmark.sh` and modify following line to target Datafusion version
```
git checkout 36.0.0
git checkout 40.0.0
```
1. `bash benchmark.sh`

### Known Issues:

1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in quries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`)
2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in quries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`)
3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these quries in mac, you'll get some errors for quries contain binary format apache/arrow-datafusion#3050
1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`)
2. importing parquet by `datafusion-cli` makes column names case-sensitive, so all column names in queries.sql were changed to double-quoted identifiers (e.g. `EventTime` -> `"EventTime"`)
3. `comparing binary with utf-8` and `group by binary` don't work on mac; if you run these queries on mac, you'll get errors for queries that contain binary format apache/arrow-datafusion#3050


## Generate full human readable results (for debugging)
Expand Down
2 changes: 1 addition & 1 deletion datafusion/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ sudo yum install gcc -y
# Install DataFusion main branch
git clone https://github.com/apache/arrow-datafusion.git
cd arrow-datafusion/datafusion-cli
git checkout 36.0.0
git checkout 40.0.0
CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release
export PATH="`pwd`/target/release:$PATH"
cd ../..
Expand Down
90 changes: 45 additions & 45 deletions datafusion/results/partitioned.json
Original file line number Diff line number Diff line change
@@ -1,58 +1,58 @@
{
"system": "DataFusion (Parquet, partitioned)",
"date": "2024-03-07",
"date": "2024-07-27",
"machine": "c6a.4xlarge, 500gb gp2",
"cluster_size": 1,
"comment": "v36.0.0 (bf6f83b)",
"comment": "v40.0.0 (4cae813)",

"tags": ["Rust", "column-oriented", "embedded", "stateless"],

"load_time": 0,
"data_size": 14779976446,

"result": [
[0.039, 0.011, 0.009],
[0.081, 0.028, 0.025],
[0.164, 0.070, 0.069],
[0.369, 0.076, 0.073],
[1.266, 0.782, 0.782],
[1.270, 1.172, 1.182],
[0.058, 0.031, 0.033],
[0.056, 0.027, 0.027],
[1.481, 1.412, 1.389],
[1.256, 0.964, 0.968],
[0.469, 0.274, 0.279],
[0.790, 0.309, 0.308],
[1.364, 1.237, 1.255],
[3.424, 2.509, 2.520],
[1.468, 1.387, 1.402],
[0.966, 0.899, 0.900],
[3.151, 2.654, 2.619],
[3.090, 2.555, 2.580],
[6.834, 5.596, 5.624],
[0.396, 0.066, 0.068],
[10.209, 1.558, 1.580],
[11.343, 1.892, 1.855],
[22.693, 4.159, 4.199],
[55.450, 11.146, 11.161],
[2.693, 0.488, 0.491],
[0.769, 0.429, 0.422],
[2.640, 0.572, 0.561],
[9.681, 2.242, 2.267],
[8.752, 5.296, 5.205],
[0.496, 0.403, 0.402],
[2.388, 1.067, 1.064],
[6.067, 1.554, 1.546],
[8.350, 7.786, 7.676],
[11.737, 6.850, 6.943],
[12.001, 7.473, 7.597],
[1.878, 1.764, 1.757],
[0.453, 0.284, 0.283],
[0.167, 0.107, 0.106],
[0.202, 0.136, 0.122],
[0.814, 0.667, 0.642],
[0.133, 0.040, 0.044],
[0.122, 0.034, 0.033],
[0.129, 0.045, 0.044]
[0.043, 0.018, 0.016],
[0.087, 0.031, 0.028],
[0.173, 0.072, 0.073],
[0.356, 0.075, 0.081],
[1.201, 0.784, 0.796],
[0.960, 0.831, 0.837],
[0.057, 0.026, 0.026],
[0.062, 0.029, 0.031],
[1.408, 1.314, 1.315],
[1.302, 1.025, 1.038],
[0.483, 0.280, 0.269],
[0.705, 0.306, 0.296],
[1.137, 0.931, 0.939],
[3.183, 2.245, 2.252],
[1.499, 1.415, 1.429],
[1.011, 0.901, 0.897],
[3.230, 2.670, 2.655],
[3.136, 2.560, 2.539],
[6.849, 5.608, 5.827],
[0.299, 0.075, 0.068],
[10.086, 1.544, 1.617],
[11.238, 1.821, 1.835],
[21.957, 4.104, 4.132],
[55.510, 10.615, 10.548],
[2.678, 0.503, 0.500],
[0.765, 0.412, 0.413],
[2.649, 0.574, 0.559],
[9.652, 2.177, 2.203],
[8.528, 5.051, 5.019],
[0.499, 0.421, 0.439],
[2.389, 1.018, 1.028],
[6.060, 1.520, 1.513],
[8.820, 8.081, 7.826],
[10.604, 4.851, 5.088],
[10.567, 4.971, 4.880],
[1.737, 1.659, 1.649],
[0.363, 0.247, 0.231],
[0.156, 0.093, 0.092],
[0.198, 0.125, 0.124],
[0.902, 0.701, 0.683],
[0.144, 0.042, 0.041],
[0.130, 0.037, 0.040],
[0.131, 0.055, 0.050]
]
}
90 changes: 45 additions & 45 deletions datafusion/results/single.json
Original file line number Diff line number Diff line change
@@ -1,58 +1,58 @@
{
"system": "DataFusion (Parquet, single)",
"date": "2024-03-07",
"date": "2024-07-27",
"machine": "c6a.4xlarge, 500gb gp2",
"cluster_size": 1,
"comment": "v36.0.0 (bf6f83b)",
"comment": "v40.0.0 (4cae813)",

"tags": ["Rust", "column-oriented", "embedded", "stateless"],

"load_time": 0,
"data_size": 14779976446,

"result": [
[0.075, 0.045, 0.048],
[0.105, 0.059, 0.060],
[0.170, 0.100, 0.103],
[0.349, 0.105, 0.107],
[1.145, 0.841, 0.834],
[1.374, 1.251, 1.271],
[0.088, 0.064, 0.063],
[0.091, 0.065, 0.061],
[1.523, 1.442, 1.429],
[1.213, 1.025, 1.014],
[0.413, 0.315, 0.309],
[0.680, 0.334, 0.357],
[1.380, 1.260, 1.255],
[3.382, 2.497, 2.493],
[1.470, 1.392, 1.401],
[1.054, 0.946, 0.947],
[3.158, 2.703, 2.701],
[3.085, 2.615, 2.629],
[6.878, 5.644, 5.705],
[0.336, 0.098, 0.098],
[9.957, 1.526, 1.521],
[11.223, 1.853, 1.881],
[22.175, 4.074, 4.050],
[56.012, 11.500, 11.475],
[2.540, 0.578, 0.588],
[0.768, 0.524, 0.513],
[2.521, 0.670, 0.674],
[9.559, 2.258, 2.247],
[8.948, 5.095, 5.133],
[0.512, 0.460, 0.456],
[2.314, 1.127, 1.124],
[5.812, 1.591, 1.579],
[8.333, 7.788, 7.797],
[11.544, 6.860, 6.871],
[12.007, 7.641, 7.549],
[1.940, 1.815, 1.821],
[0.457, 0.318, 0.313],
[0.222, 0.179, 0.176],
[0.229, 0.178, 0.173],
[0.860, 0.693, 0.704],
[0.157, 0.073, 0.072],
[0.142, 0.069, 0.074],
[0.150, 0.086, 0.077]
[0.076, 0.051, 0.055],
[0.113, 0.066, 0.066],
[0.196, 0.115, 0.105],
[0.340, 0.114, 0.115],
[1.074, 0.862, 0.858],
[0.995, 0.874, 0.909],
[0.088, 0.076, 0.065],
[0.102, 0.078, 0.068],
[1.442, 1.349, 1.368],
[1.260, 1.083, 1.064],
[0.451, 0.306, 0.304],
[0.597, 0.337, 0.335],
[1.088, 0.986, 0.974],
[3.085, 2.261, 2.268],
[1.522, 1.428, 1.429],
[1.068, 0.957, 0.960],
[3.217, 2.702, 2.754],
[3.149, 2.621, 2.564],
[6.978, 5.679, 5.865],
[0.338, 0.107, 0.113],
[9.885, 1.466, 1.474],
[11.225, 1.794, 1.791],
[22.035, 3.906, 3.912],
[55.923, 10.899, 10.975],
[2.560, 0.579, 0.575],
[0.754, 0.509, 0.506],
[2.517, 0.674, 0.651],
[9.574, 2.220, 2.216],
[9.070, 4.926, 4.940],
[0.536, 0.473, 0.481],
[2.288, 1.090, 1.101],
[5.823, 1.543, 1.528],
[8.637, 8.328, 7.848],
[10.477, 4.972, 5.022],
[10.435, 4.910, 5.020],
[1.827, 1.685, 1.724],
[0.389, 0.275, 0.270],
[0.201, 0.175, 0.160],
[0.230, 0.173, 0.172],
[0.887, 0.749, 0.755],
[0.172, 0.085, 0.076],
[0.165, 0.075, 0.073],
[0.160, 0.090, 0.100]
]
}
2 changes: 1 addition & 1 deletion datafusion/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ cat queries.sql | while read query; do
# 2. each query contains a "Query took xxx seconds", we just grep these 2 lines
# 3. use sed to take the second line
# 4. use awk to take the number we want
RES=`datafusion-cli -f $CREATE_SQL_FILE /tmp/query.sql 2>&1 | grep "Query took" | sed -n 2p | awk '{print $7}'`
RES=`datafusion-cli -f $CREATE_SQL_FILE /tmp/query.sql 2>&1 | grep "Elapsed" |sed -n 2p | awk '{ print $2 }'`
[[ $RES != "" ]] && \
echo -n "$RES" || \
echo -n "null"
Expand Down
Loading

0 comments on commit bf07cf7

Please sign in to comment.