diff --git a/Chapter_12/Chapter_12.sql b/Chapter_12/Chapter_12.sql index 6230b91..a2f3bc3 100644 --- a/Chapter_12/Chapter_12.sql +++ b/Chapter_12/Chapter_12.sql @@ -1,60 +1,62 @@ --- FIRST EDITION FILE; IGNORE - - - --------------------------------------------------------------- --- Practical SQL: A Beginner's Guide to Storytelling with Data +--------------------------------------------------------------------------- +-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition -- by Anthony DeBarros -- Chapter 12 Code Examples --------------------------------------------------------------- +---------------------------------------------------------------------------- + -- Listing 12-1: Extracting components of a timestamp value using date_part() +SHOW ALL; + SELECT - date_part('year', '2019-12-01 18:37:12 EST'::timestamptz) AS "year", - date_part('month', '2019-12-01 18:37:12 EST'::timestamptz) AS "month", - date_part('day', '2019-12-01 18:37:12 EST'::timestamptz) AS "day", - date_part('hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "hour", - date_part('minute', '2019-12-01 18:37:12 EST'::timestamptz) AS "minute", - date_part('seconds', '2019-12-01 18:37:12 EST'::timestamptz) AS "seconds", - date_part('timezone_hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "tz", - date_part('week', '2019-12-01 18:37:12 EST'::timestamptz) AS "week", - date_part('quarter', '2019-12-01 18:37:12 EST'::timestamptz) AS "quarter", - date_part('epoch', '2019-12-01 18:37:12 EST'::timestamptz) AS "epoch"; + date_part('year', '2022-12-01 18:37:12 EST'::timestamptz) AS year, + date_part('month', '2022-12-01 18:37:12 EST'::timestamptz) AS month, + date_part('day', '2022-12-01 18:37:12 EST'::timestamptz) AS day, + date_part('hour', '2022-12-01 18:37:12 EST'::timestamptz) AS hour, + date_part('minute', '2022-12-01 18:37:12 EST'::timestamptz) AS minute, + date_part('seconds', '2022-12-01 18:37:12 EST'::timestamptz) AS seconds, + date_part('timezone_hour', '2022-12-01 18:37:12 EST'::timestamptz) AS tz, + date_part('week', '2022-12-01 18:37:12 EST'::timestamptz) AS week, + date_part('quarter', '2022-12-01 18:37:12 EST'::timestamptz) AS quarter, + date_part('epoch', '2022-12-01 18:37:12 EST'::timestamptz) AS epoch; + +SELECT '2022-12-01 18:37:12 EST'::timestamp AT TIME ZONE 'US/Pacific' -- Bonus: Using the SQL-standard extract() for similar datetime parsing: -SELECT extract('year' from '2019-12-01 18:37:12 EST'::timestamptz) AS "year"; +SELECT extract('year' from '2022-12-01 18:37:12 EST'::timestamptz) AS year; -- Listing 12-2: Three functions for making datetimes from components -- make a date -SELECT make_date(2018, 2, 22); +SELECT make_date(2022, 2, 22); -- make a time SELECT make_time(18, 4, 30.3); -- make a timestamp with time zone -SELECT make_timestamptz(2018, 2, 22, 18, 4, 30.3, 'Europe/Lisbon'); +SELECT make_timestamptz(2022, 2, 22, 18, 4, 30.3, 'Europe/Lisbon'); -- Bonus: Retrieving the current date and time SELECT + current_timestamp, + localtimestamp, current_date, current_time, - current_timestamp, localtime, - localtimestamp, now(); -- Listing 12-3: Comparing current_timestamp and clock_timestamp() during row insert CREATE TABLE current_time_example ( - time_id bigserial, + time_id integer GENERATED ALWAYS AS IDENTITY, current_timestamp_col timestamp with time zone, clock_timestamp_col timestamp with time zone ); -INSERT INTO current_time_example (current_timestamp_col, clock_timestamp_col) +INSERT INTO current_time_example + (current_timestamp_col, clock_timestamp_col) (SELECT current_timestamp, clock_timestamp() FROM generate_series(1,1000)); @@ -63,32 +65,34 @@ SELECT * FROM current_time_example; -- Time Zones --- Listing 12-4: Showing your PostgreSQL server's default time zone +-- Listing 12-4: Viewing your current time zone setting SHOW timezone; -- Note: You can see all run-time defaults with SHOW ALL; -- Listing 12-5: Showing time zone abbreviations and names -SELECT * FROM pg_timezone_abbrevs; -SELECT * FROM pg_timezone_names; +SELECT * FROM pg_timezone_abbrevs ORDER BY abbrev; +SELECT * FROM pg_timezone_names ORDER BY name; -- Filter to find one SELECT * FROM pg_timezone_names -WHERE name LIKE 'Europe%'; +WHERE name LIKE 'Europe%' +ORDER BY name; -- Listing 12-6: Setting the time zone for a client session -SET timezone TO 'US/Pacific'; +SET TIME ZONE 'US/Pacific'; CREATE TABLE time_zone_test ( test_date timestamp with time zone ); -INSERT INTO time_zone_test VALUES ('2020-01-01 4:00'); +INSERT INTO time_zone_test VALUES ('2023-01-01 4:00'); +INSERT INTO time_zone_test VALUES ('2023-07-01 4:00'); SELECT test_date FROM time_zone_test; -SET timezone TO 'US/Eastern'; +SET TIME ZONE 'US/Eastern'; SELECT test_date FROM time_zone_test; @@ -99,28 +103,28 @@ FROM time_zone_test; -- Math with dates! -SELECT '9/30/1929'::date - '9/27/1929'::date; -SELECT '9/30/1929'::date + '5 years'::interval; +SELECT '1929-09-30'::date - '1929-09-27'::date; +SELECT '1929-09-30'::date + '5 years'::interval; -- Taxi Rides -- Listing 12-7: Creating a table and importing NYC yellow taxi data -CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 ( - trip_id bigserial PRIMARY KEY, - vendor_id varchar(1) NOT NULL, +CREATE TABLE nyc_yellow_taxi_trips ( + trip_id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + vendor_id text NOT NULL, tpep_pickup_datetime timestamp with time zone NOT NULL, tpep_dropoff_datetime timestamp with time zone NOT NULL, passenger_count integer NOT NULL, trip_distance numeric(8,2) NOT NULL, pickup_longitude numeric(18,15) NOT NULL, pickup_latitude numeric(18,15) NOT NULL, - rate_code_id varchar(2) NOT NULL, - store_and_fwd_flag varchar(1) NOT NULL, + rate_code_id text NOT NULL, + store_and_fwd_flag text NOT NULL, dropoff_longitude numeric(18,15) NOT NULL, dropoff_latitude numeric(18,15) NOT NULL, - payment_type varchar(1) NOT NULL, + payment_type text NOT NULL, fare_amount numeric(9,2) NOT NULL, extra numeric(9,2) NOT NULL, mta_tax numeric(5,2) NOT NULL, @@ -130,7 +134,7 @@ CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 ( total_amount numeric(9,2) NOT NULL ); -COPY nyc_yellow_taxi_trips_2016_06_01 ( +COPY nyc_yellow_taxi_trips ( vendor_id, tpep_pickup_datetime, tpep_dropoff_datetime, @@ -151,20 +155,23 @@ COPY nyc_yellow_taxi_trips_2016_06_01 ( improvement_surcharge, total_amount ) -FROM 'C:\YourDirectory\yellow_tripdata_2016_06_01.csv' -WITH (FORMAT CSV, HEADER, DELIMITER ','); +-- FROM 'C:\YourDirectory\nyc_yellow_taxi_trips.csv' +FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_12/nyc_yellow_taxi_trips.csv' +WITH (FORMAT CSV, HEADER); CREATE INDEX tpep_pickup_idx -ON nyc_yellow_taxi_trips_2016_06_01 (tpep_pickup_datetime); +ON nyc_yellow_taxi_trips (tpep_pickup_datetime); -SELECT count(*) FROM nyc_yellow_taxi_trips_2016_06_01; +SELECT count(*) FROM nyc_yellow_taxi_trips; -- Listing 12-8: Counting taxi trips by hour +SET TIME ZONE 'US/Eastern'; -- Set this if your PostgreSQL server defaults to a time zone other than US/Eastern + SELECT date_part('hour', tpep_pickup_datetime) AS trip_hour, count(*) -FROM nyc_yellow_taxi_trips_2016_06_01 +FROM nyc_yellow_taxi_trips GROUP BY trip_hour ORDER BY trip_hour; @@ -174,12 +181,13 @@ COPY (SELECT date_part('hour', tpep_pickup_datetime) AS trip_hour, count(*) - FROM nyc_yellow_taxi_trips_2016_06_01 + FROM nyc_yellow_taxi_trips GROUP BY trip_hour ORDER BY trip_hour ) -TO 'C:\YourDirectory\hourly_pickups_2016_06_01.csv' -WITH (FORMAT CSV, HEADER, DELIMITER ','); +-- TO 'C:\YourDirectory\hourly_taxi_pickups.csv' +TO '/Users/adebarros/Desktop/hourly_taxi_pickups.csv' +WITH (FORMAT CSV, HEADER); -- Listing 12-10: Calculating median trip time by hour @@ -188,29 +196,29 @@ SELECT percentile_cont(.5) WITHIN GROUP (ORDER BY tpep_dropoff_datetime - tpep_pickup_datetime) AS median_trip -FROM nyc_yellow_taxi_trips_2016_06_01 +FROM nyc_yellow_taxi_trips GROUP BY trip_hour ORDER BY trip_hour; -- Listing 12-11: Creating a table to hold train trip data -SET timezone TO 'US/Central'; - CREATE TABLE train_rides ( - trip_id bigserial PRIMARY KEY, - segment varchar(50) NOT NULL, + trip_id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + segment text NOT NULL, departure timestamp with time zone NOT NULL, arrival timestamp with time zone NOT NULL ); INSERT INTO train_rides (segment, departure, arrival) VALUES - ('Chicago to New York', '2017-11-13 21:30 CST', '2017-11-14 18:23 EST'), - ('New York to New Orleans', '2017-11-15 14:15 EST', '2017-11-16 19:32 CST'), - ('New Orleans to Los Angeles', '2017-11-17 13:45 CST', '2017-11-18 9:00 PST'), - ('Los Angeles to San Francisco', '2017-11-19 10:10 PST', '2017-11-19 21:24 PST'), - ('San Francisco to Denver', '2017-11-20 9:10 PST', '2017-11-21 18:38 MST'), - ('Denver to Chicago', '2017-11-22 19:10 MST', '2017-11-23 14:50 CST'); + ('Chicago to New York', '2020-11-13 21:30 CST', '2020-11-14 18:23 EST'), + ('New York to New Orleans', '2020-11-15 14:15 EST', '2020-11-16 19:32 CST'), + ('New Orleans to Los Angeles', '2020-11-17 13:45 CST', '2020-11-18 9:00 PST'), + ('Los Angeles to San Francisco', '2020-11-19 10:10 PST', '2020-11-19 21:24 PST'), + ('San Francisco to Denver', '2020-11-20 9:10 PST', '2020-11-21 18:38 MST'), + ('Denver to Chicago', '2020-11-22 19:10 MST', '2020-11-23 14:50 CST'); + +SET TIME ZONE 'US/Central'; SELECT * FROM train_rides; diff --git a/Chapter_12/yellow_tripdata_2016_06_01.csv b/Chapter_12/nyc_yellow_taxi_trips.csv similarity index 100% rename from Chapter_12/yellow_tripdata_2016_06_01.csv rename to Chapter_12/nyc_yellow_taxi_trips.csv diff --git a/Try_It_Yourself/Try_It_Yourself.sql b/Try_It_Yourself/Try_It_Yourself.sql index 510181e..f92fb02 100644 --- a/Try_It_Yourself/Try_It_Yourself.sql +++ b/Try_It_Yourself/Try_It_Yourself.sql @@ -729,3 +729,70 @@ FROM pls_fy2018_libraries WHERE popu_lsa >= 250000; +---------------------------------------------------------------------------- +-- Chapter 12: Working with Dates and Times +---------------------------------------------------------------------------- + +-- 1. Using the New York City taxi data, calculate the length of each ride using +-- the pickup and drop-off timestamps. Sort the query results from the longest +-- ride to the shortest. Do you notice anything about the longest or shortest +-- trips that you might want to ask city officials about? + +-- Answer: More than 480 of the trips last more than 10 hours, which seems +-- excessive. Moreover, two records have drop-off times before the pickup time, +-- and several have pickup and drop-off times that are the same. It's worth +-- asking whether these records have timestamp errors. + +SELECT + trip_id, + tpep_pickup_datetime, + tpep_dropoff_datetime, + tpep_dropoff_datetime - tpep_pickup_datetime AS length_of_ride +FROM nyc_yellow_taxi_trips +ORDER BY length_of_ride DESC; + +-- 2. Using the AT TIME ZONE keywords, write a query that displays the date and +-- time for London, Johannesburg, Moscow, and Melbourne the moment January 1, +-- 2100, arrives in New York City. + +-- Answer: + +SELECT '2100-01-01 00:00:00-05' AT TIME ZONE 'US/Eastern' AS new_york, + '2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/London' AS london, + '2100-01-01 00:00:00-05' AT TIME ZONE 'Africa/Johannesburg' AS johannesburg, + '2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/Moscow' AS moscow, + '2100-01-01 00:00:00-05' AT TIME ZONE 'Australia/Melbourne' AS melbourne; + +-- 3. As a bonus challenge, use the statistics functions in Chapter 11 to +-- calculate the correlation coefficient and r-squared values using trip time +-- and the total_amount column in the New York City taxi data, which represents +-- total amount charged to passengers. Do the same with trip_distance and +-- total_amount. Limit the query to rides that last three hours or less. + +-- Answer: + +SELECT + round( + corr(total_amount, ( + date_part('epoch', tpep_dropoff_datetime) - + date_part('epoch', tpep_pickup_datetime) + ))::numeric, 2 + ) AS amount_time_corr, + round( + regr_r2(total_amount, ( + date_part('epoch', tpep_dropoff_datetime) - + date_part('epoch', tpep_pickup_datetime) + ))::numeric, 2 + ) AS amount_time_r2, + round( + corr(total_amount, trip_distance)::numeric, 2 + ) AS amount_distance_corr, + round( + regr_r2(total_amount, trip_distance)::numeric, 2 + ) AS amount_distance_r2 +FROM nyc_yellow_taxi_trips +WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval; + +-- Note: Both correlations are strong, with r values of 0.80 or higher. We'd +-- expect this given the cost of a taxi ride is based on both time and distance. +