You will need around 250 GB of RAM for the full dataset
Disable swap
swapoff -a
Disable THP (=Transparent Huge Page) if activated
On Centos (as
root)
echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag
On Redhat (as
root)
echo never > /sys/kernel/mm/redhat_transparent_hugepage/enabled
echo never > /sys/kernel/mm/redhat_transparent_hugepage/defrag
../../../40_BENCH/nyc_files/stormbase.conf
MAX_PERF_USE_COMPRESSION:y
CSV_FILES_DIR:/home/postgres/CSV
FILE_SEPARATOR:;
../../../40_BENCH/nyc_files/insert_all.sql
create col_type t_location_id as text;
create col_type t_pickup_location_id as text;
create col_type t_dropoff_location_id as text;
create col_type t_date as text;
create big table trips (
 cab_type_id text,
 vendor_id text,
 pickup_date t_date,
 pickup_time text,
 dropoff_date text,
 dropoff_time text,
 store_and_fwd_flag text,
 rate_code_id text,
 pickup_longitude number,
 pickup_latitude number,
 dropoff_longitude number,
 dropoff_latitude number,
 passenger_count number,
 trip_distance number,
 fare_amount number,
 extra number,
 mta_tax number,
 tip_amount number,
 tolls_amount number,
 ehail_fee number,
 improvement_surcharge number,
 total_amount number,
 payment_type text,
 trip_type text,
 pickup_nyct2010_gid text,
 dropoff_nyct2010_gid text,
 pickup_location_id t_pickup_location_id,
 dropoff_location_id t_dropoff_location_id
);
create merge table pickup_location(
 pickup_location_id t_pickup_location_id,
 pickup_location_id2 t_location_id
);
create merge table dropoff_location(
 dropoff_location_id t_dropoff_location_id,
 dropoff_location_id2 t_location_id
);
create merge table calendar(
date t_date,
week_day text,
month_name text,
full_year text
);
create view v_calendar as select * from calendar;
create view v_pickup_location as select * from pickup_location;
create view v_dropoff_location as select * from dropoff_location;
create view v_trips as
 select * from trips, pickup_location, dropoff_location, calendar
 where pickup_location.pickup_location_id=trips.pickup_location_id
 and dropoff_location.dropoff_location_id=trips.dropoff_location_id
 and calendar.date=trips.pickup_date
;
insert into pickup_location select * from file('pickup_location.csv');
insert into dropoff_location select * from file('dropoff_location.csv');
insert into calendar select * from file('calendar.csv');
insert into trips select * from file('trips_2009.csv');
insert into trips select * from file('trips_2010.csv');
insert into trips select * from file('trips_2011.csv');
insert into trips select * from file('trips_2012.csv');
insert into trips select * from file('trips_2013.csv');
insert into trips select * from file('trips_2014.csv');
insert into trips select * from file('trips_2015.csv');
insert into trips select * from file('trips_2016.csv');
insert into trips select * from file('trips_2017.csv');
insert into trips select * from file('trips_2018.csv');
save;
refresh dirty view;
save;
../../../40_BENCH/nyc_files/insert_all.log
3319
[2021-01-22 09:40:24] --------- NEW CONNECTION ------------ CACHE SIZE : 0 / 0
[2021-01-22 09:40:24] connecting to localhost : 3319
[2021-01-22 09:40:24] -> authentication;
[2021-01-22 09:40:24] <- data ( 2 ms )
[2021-01-22 09:40:24] -> create col_type t_location_id as text;
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create col_type t_pickup_location_id as text;
[2021-01-22 09:40:24] <- data ( 100 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create col_type t_dropoff_location_id as text;
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create col_type t_date as text;
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create big table trips (
 cab_type_id text,
 vendor_id text,
 pickup_date t_date,
 pickup_time text,
 dropoff_date text,
 dropoff_time text,
 store_and_fwd_flag text,
 rate_code_id text,
 pickup_longitude number,
 pickup_latitude number,
 dropoff_longitude number,
 dropoff_latitude number,
 passenger_count number,
 trip_distance number,
 fare_amount number,
 extra number,
 mta_tax number,
 tip_amount number,
 tolls_amount number,
 ehail_fee number,
 improvement_surcharge number,
 total_amount number,
 payment_type text,
 trip_type text,
 pickup_nyct2010_gid text,
 dropoff_nyct2010_gid text,
 pickup_location_id t_pickup_location_id,
 dropoff_location_id t_dropoff_location_id
);
[2021-01-22 09:40:24] <- data ( 175 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create merge table pickup_location(
 pickup_location_id t_pickup_location_id,
 pickup_location_id2 t_location_id
);
[2021-01-22 09:40:24] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create merge table dropoff_location(
 dropoff_location_id t_dropoff_location_id,
 dropoff_location_id2 t_location_id
);
[2021-01-22 09:40:24] <- data ( 101 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] ->
create merge table calendar(
date t_date,
week_day text,
month_name text,
full_year text
);
[2021-01-22 09:40:24] <- data ( 103 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:24] -> create view v_calendar as select * from calendar;
[2021-01-22 09:40:25] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> create view v_pickup_location as select * from pickup_location;
[2021-01-22 09:40:25] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> create view v_dropoff_location as select * from dropoff_location;
[2021-01-22 09:40:25] <- data ( 103 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] ->
create view v_trips as
 select * from trips, pickup_location, dropoff_location, calendar
 where pickup_location.pickup_location_id=trips.pickup_location_id
 and dropoff_location.dropoff_location_id=trips.dropoff_location_id
 and calendar.date=trips.pickup_date;
[2021-01-22 09:40:25] <- data ( 104 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into pickup_location select * from file('pickup_location.csv');
[2021-01-22 09:40:25] <- data ( 102 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into dropoff_location select * from file('dropoff_location.csv');
[2021-01-22 09:40:25] <- data ( 103 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into calendar select * from file('calendar.csv');
[2021-01-22 09:40:25] <- data ( 262 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:40:25] -> insert into trips select * from file('trips_2009.csv');
[2021-01-22 09:52:10] <- data ( 704998 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 09:52:10] -> insert into trips select * from file('trips_2010.csv');
[2021-01-22 10:03:04] <- data ( 653457 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:03:04] -> insert into trips select * from file('trips_2011.csv');
[2021-01-22 10:16:52] <- data ( 828521 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:16:52] -> insert into trips select * from file('trips_2012.csv');
[2021-01-22 10:30:13] <- data ( 801036 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:30:13] -> insert into trips select * from file('trips_2013.csv');
[2021-01-22 10:42:40] <- data ( 746230 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:42:40] -> insert into trips select * from file('trips_2014.csv');
[2021-01-22 10:55:55] <- data ( 795400 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 10:55:55] -> insert into trips select * from file('trips_2015.csv');
[2021-01-22 11:07:49] <- data ( 714364 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:07:49] -> insert into trips select * from file('trips_2016.csv');
[2021-01-22 11:16:18] <- data ( 508225 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:16:18] -> insert into trips select * from file('trips_2017.csv');
[2021-01-22 11:21:55] <- data ( 337340 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:21:55] -> insert into trips select * from file('trips_2018.csv');
[2021-01-22 11:26:57] <- data ( 301737 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 11:26:57] -> save;
[2021-01-22 12:04:44] <- data ( 2266823 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 12:04:44] -> refresh dirty view;
SIGHUP received (ignored)
[2021-01-22 12:35:21] <- data ( 1837801 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
[2021-01-22 12:35:22] -> save;
[2021-01-22 12:50:19] <- data ( 897616 ms )
| reponse|
|--------------------------------------------------|
| success|
ROWCOUNT : 1
../../../40_BENCH/nyc_files/size.log
bash-4.2$ ./size.sh
CSV_FILES_DIR /home/postgres/CSV 260782276 KB
INTERNAL_FILES_DIR ../INT 197165912 KB
MAX_PERF_FILES_DIR ../MAX_PERF 91560016 KB
TMP_FILES_DIR ../TMP 4 KB
shutdown
;