<  *  | *** >

Taxi benchmark 

  • data model
  • pre requisite
  • download data
  • insert data
  • full load

-- ###########################
-- RUNNING clear.sh

/*
sure mode, no confirmation prompt
clearing directory INTERNAL_FILES_DIR (../STORAGE/INT) ...
done
clearing directory MAX_PERF_FILES_DIR (../STORAGE/MAX_PERF) ...
done
clearing directory TRANSAC_FILES_DIR (../STORAGE/TRANSAC) ...
done
*/

-- ###########################
-- RUNNING doc_bench.sql
--

Taxi benchmark

data model

stop_on_error;
reponse
success

create col_type t_location_id as text;
reponse
success

create col_type t_pickup_location_id as text;
reponse
success

create col_type t_dropoff_location_id as text;
reponse
success

create col_type t_date as text;
reponse
success

create big table trips (
  cab_type_id text,
  vendor_id text,
  pickup_date t_date,
  pickup_time text,
  dropoff_date text,
  dropoff_time text,
  store_and_fwd_flag text,
  rate_code_id text,
  pickup_longitude number,
  pickup_latitude number,
  dropoff_longitude number,
  dropoff_latitude number,
  passenger_count number,
  trip_distance number,
  fare_amount number,
  extra number,
  mta_tax number,
  tip_amount number,
  tolls_amount number,
  ehail_fee number,
  improvement_surcharge number,
  total_amount number,
  payment_type text,
  trip_type text,
  pickup_nyct2010_gid text,
  dropoff_nyct2010_gid text,
  pickup_location_id t_pickup_location_id,
  dropoff_location_id t_dropoff_location_id
);
reponse
success

create merge table pickup_location(
  pickup_location_id t_pickup_location_id,
  pickup_location_id2 t_location_id
);
reponse
success

create merge table dropoff_location(
  dropoff_location_id t_dropoff_location_id,
  dropoff_location_id2 t_location_id
);
reponse
success

create merge table calendar(
date t_date,
week_day text,
month_name text,
full_year text
);
reponse
success

create view v_trips as
  select * from trips, pickup_location, dropoff_location, calendar
  where pickup_location.pickup_location_id=trips.pickup_location_id
  and dropoff_location.dropoff_location_id=trips.dropoff_location_id
  and calendar.date=trips.pickup_date
;
reponse
success

pre requisite

You will need around 250 GB of RAM for the full dataset
Disable swap
swapoff -a
Disable THP (=Transparent Huge Page) if activated
On Centos (as root)
echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag
On Redhat (as root)
echo never > /sys/kernel/mm/redhat_transparent_hugepage/enabled
echo never > /sys/kernel/mm/redhat_transparent_hugepage/defrag

download data

Https://www.stormbase.fr/downloads/taxi_small.zip

insert data

set file_separator=';';
reponse
success

insert into pickup_location select * from file('pickup_location.csv');
reponse
success

insert into dropoff_location select * from file('dropoff_location.csv');
reponse
success

insert into calendar select * from file('calendar.csv');
reponse
success

insert into trips select * from file('trips.csv') where rownum<10;
reponse
success

refresh dirty views;
reponse
success

save;
reponse
success

desc table;
table_namecolumn_countline_counthas_deletehas_updateparent_view_hidden
trips289nnn
pickup_location2202nnn
dropoff_location2255nnn
calendar44nnn

desc view;
view_nametable_countrel_countis_dirtyis_mp_dirtyis_partialfirst_ilinemp_rare_countmp_often_countmp_countr_online_countmp_r_online_count
v_trips49nnn105141900

select * from v_trips;
cab_type_idvendor_idpickup_datepickup_timedropoff_datedropoff_timestore_and_fwd_flagrate_code_idpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudepassenger_counttrip_distancefare_amountextramta_taxtip_amounttolls_amountehail_feeimprovement_surchargetotal_amountpayment_typetrip_typepickup_nyct2010_giddropoff_nyct2010_gidpickup_location_iddropoff_location_idpickup_location_idpickup_location_id2dropoff_location_iddropoff_location_id2dateweek_daymonth_namefull_year
222016080405:002016080405:16N1000012.130120.5000.5000000.30013.30021##12922312912922322320160804ThursdayAugust2016
222016080405:132016080405:14N1000010.1202.5000.5000.50010000.30013.80011##24424424424424424420160804ThursdayAugust2016
222016080405:312016080405:45N1000012.440120.5000.5000000.30013.30011##18122818118122822820160804ThursdayAugust2016
222016080405:082016080405:14N1000010.83060.5000.5000000.3007.30021##41166414116616620160804ThursdayAugust2016
222016080405:462016080406:05N1000015.62018.5000.5000.5000000.30019.80021##71647716416420160804ThursdayAugust2016
222016080405:592016080405:59N50000101000000112##21212121212120160804ThursdayAugust2016
222016080405:192016080405:30N1000012.140100.5000.5000000.30011.30021##11222511211222522520160804ThursdayAugust2016
222016080405:422016080405:50N1000011.3407.5000.5000.5000000.3008.80021##82173828217317320160804ThursdayAugust2016
222016080405:172016080405:21N1000011.42060.5000.5001.460000.3008.76011##16611616616611611620160804ThursdayAugust2016

full load

--important parameters

../../../40_BENCH/nyc_files/stormbase.conf

MAX_PERF_USE_COMPRESSION:y
CSV_FILES_DIR:/home/postgres/CSV
FILE_SEPARATOR:;

--script

../../../40_BENCH/nyc_files/insert_all.sql

create col_type t_location_id as text;
create col_type t_pickup_location_id as text;
create col_type t_dropoff_location_id as text;
create col_type t_date as text;

create big table trips (
  cab_type_id text,
  vendor_id text,
  pickup_date t_date,
  pickup_time text,
  dropoff_date text,
  dropoff_time text,
  store_and_fwd_flag text,
  rate_code_id text,
  pickup_longitude number,
  pickup_latitude number,
  dropoff_longitude number,
  dropoff_latitude number,
  passenger_count number,
  trip_distance number,
  fare_amount number,
  extra number,
  mta_tax number,
  tip_amount number,
  tolls_amount number,
  ehail_fee number,
  improvement_surcharge number,
  total_amount number,
  payment_type text,
  trip_type text,
  pickup_nyct2010_gid text,
  dropoff_nyct2010_gid text,
  pickup_location_id t_pickup_location_id,
  dropoff_location_id t_dropoff_location_id
);

create merge table pickup_location(
  pickup_location_id t_pickup_location_id,
  pickup_location_id2 t_location_id
);

create merge table dropoff_location(
  dropoff_location_id t_dropoff_location_id,
  dropoff_location_id2 t_location_id
);

create merge table calendar(
date t_date,
week_day text,
month_name text,
full_year text
);

create view v_calendar as select * from calendar;
create view v_pickup_location as select * from pickup_location;
create view v_dropoff_location as select * from dropoff_location;

create view v_trips as
  select * from trips, pickup_location, dropoff_location, calendar
  where pickup_location.pickup_location_id=trips.pickup_location_id
  and dropoff_location.dropoff_location_id=trips.dropoff_location_id
  and calendar.date=trips.pickup_date
;

insert into pickup_location select * from file('pickup_location.csv');
insert into dropoff_location select * from file('dropoff_location.csv');
insert into calendar select * from file('calendar.csv');

insert into trips select * from file('trips_2009.csv');
insert into trips select * from file('trips_2010.csv');
insert into trips select * from file('trips_2011.csv');
insert into trips select * from file('trips_2012.csv');
insert into trips select * from file('trips_2013.csv');
insert into trips select * from file('trips_2014.csv');
insert into trips select * from file('trips_2015.csv');
insert into trips select * from file('trips_2016.csv');
insert into trips select * from file('trips_2017.csv');
insert into trips select * from file('trips_2018.csv');

save;
refresh dirty view;

save;


--logs

../../../40_BENCH/nyc_files/insert_all.log

3319
[2021-01-22 09:40:24] --------- NEW CONNECTION ------------ CACHE SIZE : 0 / 0
[2021-01-22 09:40:24] connecting to localhost : 3319
[2021-01-22 09:40:24] -> authentication;
[2021-01-22 09:40:24] <- data ( 2 ms )
[2021-01-22 09:40:24] -> create col_type t_location_id as text;
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create col_type t_pickup_location_id as text;
[2021-01-22 09:40:24] <- data ( 100 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create col_type t_dropoff_location_id as text;
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create col_type t_date as text;
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create big table trips (
  cab_type_id text,
  vendor_id text,
  pickup_date t_date,
  pickup_time text,
  dropoff_date text,
  dropoff_time text,
  store_and_fwd_flag text,
  rate_code_id text,
  pickup_longitude number,
  pickup_latitude number,
  dropoff_longitude number,
  dropoff_latitude number,
  passenger_count number,
  trip_distance number,
  fare_amount number,
  extra number,
  mta_tax number,
  tip_amount number,
  tolls_amount number,
  ehail_fee number,
  improvement_surcharge number,
  total_amount number,
  payment_type text,
  trip_type text,
  pickup_nyct2010_gid text,
  dropoff_nyct2010_gid text,
  pickup_location_id t_pickup_location_id,
  dropoff_location_id t_dropoff_location_id
);
[2021-01-22 09:40:24] <- data ( 175 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create merge table pickup_location(
  pickup_location_id t_pickup_location_id,
  pickup_location_id2 t_location_id
);
[2021-01-22 09:40:24] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create merge table dropoff_location(
  dropoff_location_id t_dropoff_location_id,
  dropoff_location_id2 t_location_id
);
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create merge table calendar(
date t_date,
week_day text,
month_name text,
full_year text
);
[2021-01-22 09:40:24] <- data ( 103 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create view v_calendar as select * from calendar;
[2021-01-22 09:40:25] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> create view v_pickup_location as select * from pickup_location;
[2021-01-22 09:40:25] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> create view v_dropoff_location as select * from dropoff_location;
[2021-01-22 09:40:25] <- data ( 103 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] ->
create view v_trips as
  select * from trips, pickup_location, dropoff_location, calendar
  where pickup_location.pickup_location_id=trips.pickup_location_id
  and dropoff_location.dropoff_location_id=trips.dropoff_location_id
  and calendar.date=trips.pickup_date;
[2021-01-22 09:40:25] <- data ( 104 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into pickup_location select * from file('pickup_location.csv');
[2021-01-22 09:40:25] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into dropoff_location select * from file('dropoff_location.csv');
[2021-01-22 09:40:25] <- data ( 103 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into calendar select * from file('calendar.csv');
[2021-01-22 09:40:25] <- data ( 262 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into trips select * from file('trips_2009.csv');
[2021-01-22 09:52:10] <- data ( 704998 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:52:10] -> insert into trips select * from file('trips_2010.csv');
[2021-01-22 10:03:04] <- data ( 653457 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:03:04] -> insert into trips select * from file('trips_2011.csv');
[2021-01-22 10:16:52] <- data ( 828521 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:16:52] -> insert into trips select * from file('trips_2012.csv');
[2021-01-22 10:30:13] <- data ( 801036 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:30:13] -> insert into trips select * from file('trips_2013.csv');
[2021-01-22 10:42:40] <- data ( 746230 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:42:40] -> insert into trips select * from file('trips_2014.csv');
[2021-01-22 10:55:55] <- data ( 795400 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:55:55] -> insert into trips select * from file('trips_2015.csv');
[2021-01-22 11:07:49] <- data ( 714364 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:07:49] -> insert into trips select * from file('trips_2016.csv');
[2021-01-22 11:16:18] <- data ( 508225 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:16:18] -> insert into trips select * from file('trips_2017.csv');
[2021-01-22 11:21:55] <- data ( 337340 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:21:55] -> insert into trips select * from file('trips_2018.csv');
[2021-01-22 11:26:57] <- data ( 301737 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:26:57] -> save;
[2021-01-22 12:04:44] <- data ( 2266823 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 12:04:44] -> refresh dirty view;
SIGHUP received (ignored)
[2021-01-22 12:35:21] <- data ( 1837801 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 12:35:22] -> save;
[2021-01-22 12:50:19] <- data ( 897616 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1

--size info

../../../40_BENCH/nyc_files/size.log

bash-4.2$ ./size.sh
CSV_FILES_DIR /home/postgres/CSV 260782276 KB
INTERNAL_FILES_DIR ../INT 197165912 KB
MAX_PERF_FILES_DIR ../MAX_PERF 91560016 KB
TMP_FILES_DIR ../TMP 4 KB

-- ###########################
-- RUNNING shutdown.sql
shutdown;