Chapter 12 code and data

This commit is contained in:
anthonydb 2020-11-08 17:59:17 -05:00
parent 5710d13161
commit 4db62752dc
3 changed files with 134 additions and 59 deletions

View File

@ -1,60 +1,62 @@
-- FIRST EDITION FILE; IGNORE
--------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data
---------------------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
-- by Anthony DeBarros
-- Chapter 12 Code Examples
--------------------------------------------------------------
----------------------------------------------------------------------------
-- Listing 12-1: Extracting components of a timestamp value using date_part()
SHOW ALL;
SELECT
date_part('year', '2019-12-01 18:37:12 EST'::timestamptz) AS "year",
date_part('month', '2019-12-01 18:37:12 EST'::timestamptz) AS "month",
date_part('day', '2019-12-01 18:37:12 EST'::timestamptz) AS "day",
date_part('hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "hour",
date_part('minute', '2019-12-01 18:37:12 EST'::timestamptz) AS "minute",
date_part('seconds', '2019-12-01 18:37:12 EST'::timestamptz) AS "seconds",
date_part('timezone_hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "tz",
date_part('week', '2019-12-01 18:37:12 EST'::timestamptz) AS "week",
date_part('quarter', '2019-12-01 18:37:12 EST'::timestamptz) AS "quarter",
date_part('epoch', '2019-12-01 18:37:12 EST'::timestamptz) AS "epoch";
date_part('year', '2022-12-01 18:37:12 EST'::timestamptz) AS year,
date_part('month', '2022-12-01 18:37:12 EST'::timestamptz) AS month,
date_part('day', '2022-12-01 18:37:12 EST'::timestamptz) AS day,
date_part('hour', '2022-12-01 18:37:12 EST'::timestamptz) AS hour,
date_part('minute', '2022-12-01 18:37:12 EST'::timestamptz) AS minute,
date_part('seconds', '2022-12-01 18:37:12 EST'::timestamptz) AS seconds,
date_part('timezone_hour', '2022-12-01 18:37:12 EST'::timestamptz) AS tz,
date_part('week', '2022-12-01 18:37:12 EST'::timestamptz) AS week,
date_part('quarter', '2022-12-01 18:37:12 EST'::timestamptz) AS quarter,
date_part('epoch', '2022-12-01 18:37:12 EST'::timestamptz) AS epoch;
SELECT '2022-12-01 18:37:12 EST'::timestamp AT TIME ZONE 'US/Pacific'
-- Bonus: Using the SQL-standard extract() for similar datetime parsing:
SELECT extract('year' from '2019-12-01 18:37:12 EST'::timestamptz) AS "year";
SELECT extract('year' from '2022-12-01 18:37:12 EST'::timestamptz) AS year;
-- Listing 12-2: Three functions for making datetimes from components
-- make a date
SELECT make_date(2018, 2, 22);
SELECT make_date(2022, 2, 22);
-- make a time
SELECT make_time(18, 4, 30.3);
-- make a timestamp with time zone
SELECT make_timestamptz(2018, 2, 22, 18, 4, 30.3, 'Europe/Lisbon');
SELECT make_timestamptz(2022, 2, 22, 18, 4, 30.3, 'Europe/Lisbon');
-- Bonus: Retrieving the current date and time
SELECT
current_timestamp,
localtimestamp,
current_date,
current_time,
current_timestamp,
localtime,
localtimestamp,
now();
-- Listing 12-3: Comparing current_timestamp and clock_timestamp() during row insert
CREATE TABLE current_time_example (
time_id bigserial,
time_id integer GENERATED ALWAYS AS IDENTITY,
current_timestamp_col timestamp with time zone,
clock_timestamp_col timestamp with time zone
);
INSERT INTO current_time_example (current_timestamp_col, clock_timestamp_col)
INSERT INTO current_time_example
(current_timestamp_col, clock_timestamp_col)
(SELECT current_timestamp,
clock_timestamp()
FROM generate_series(1,1000));
@ -63,32 +65,34 @@ SELECT * FROM current_time_example;
-- Time Zones
-- Listing 12-4: Showing your PostgreSQL server's default time zone
-- Listing 12-4: Viewing your current time zone setting
SHOW timezone; -- Note: You can see all run-time defaults with SHOW ALL;
-- Listing 12-5: Showing time zone abbreviations and names
SELECT * FROM pg_timezone_abbrevs;
SELECT * FROM pg_timezone_names;
SELECT * FROM pg_timezone_abbrevs ORDER BY abbrev;
SELECT * FROM pg_timezone_names ORDER BY name;
-- Filter to find one
SELECT * FROM pg_timezone_names
WHERE name LIKE 'Europe%';
WHERE name LIKE 'Europe%'
ORDER BY name;
-- Listing 12-6: Setting the time zone for a client session
SET timezone TO 'US/Pacific';
SET TIME ZONE 'US/Pacific';
CREATE TABLE time_zone_test (
test_date timestamp with time zone
);
INSERT INTO time_zone_test VALUES ('2020-01-01 4:00');
INSERT INTO time_zone_test VALUES ('2023-01-01 4:00');
INSERT INTO time_zone_test VALUES ('2023-07-01 4:00');
SELECT test_date
FROM time_zone_test;
SET timezone TO 'US/Eastern';
SET TIME ZONE 'US/Eastern';
SELECT test_date
FROM time_zone_test;
@ -99,28 +103,28 @@ FROM time_zone_test;
-- Math with dates!
SELECT '9/30/1929'::date - '9/27/1929'::date;
SELECT '9/30/1929'::date + '5 years'::interval;
SELECT '1929-09-30'::date - '1929-09-27'::date;
SELECT '1929-09-30'::date + '5 years'::interval;
-- Taxi Rides
-- Listing 12-7: Creating a table and importing NYC yellow taxi data
CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 (
trip_id bigserial PRIMARY KEY,
vendor_id varchar(1) NOT NULL,
CREATE TABLE nyc_yellow_taxi_trips (
trip_id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
vendor_id text NOT NULL,
tpep_pickup_datetime timestamp with time zone NOT NULL,
tpep_dropoff_datetime timestamp with time zone NOT NULL,
passenger_count integer NOT NULL,
trip_distance numeric(8,2) NOT NULL,
pickup_longitude numeric(18,15) NOT NULL,
pickup_latitude numeric(18,15) NOT NULL,
rate_code_id varchar(2) NOT NULL,
store_and_fwd_flag varchar(1) NOT NULL,
rate_code_id text NOT NULL,
store_and_fwd_flag text NOT NULL,
dropoff_longitude numeric(18,15) NOT NULL,
dropoff_latitude numeric(18,15) NOT NULL,
payment_type varchar(1) NOT NULL,
payment_type text NOT NULL,
fare_amount numeric(9,2) NOT NULL,
extra numeric(9,2) NOT NULL,
mta_tax numeric(5,2) NOT NULL,
@ -130,7 +134,7 @@ CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 (
total_amount numeric(9,2) NOT NULL
);
COPY nyc_yellow_taxi_trips_2016_06_01 (
COPY nyc_yellow_taxi_trips (
vendor_id,
tpep_pickup_datetime,
tpep_dropoff_datetime,
@ -151,20 +155,23 @@ COPY nyc_yellow_taxi_trips_2016_06_01 (
improvement_surcharge,
total_amount
)
FROM 'C:\YourDirectory\yellow_tripdata_2016_06_01.csv'
WITH (FORMAT CSV, HEADER, DELIMITER ',');
-- FROM 'C:\YourDirectory\nyc_yellow_taxi_trips.csv'
FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_12/nyc_yellow_taxi_trips.csv'
WITH (FORMAT CSV, HEADER);
CREATE INDEX tpep_pickup_idx
ON nyc_yellow_taxi_trips_2016_06_01 (tpep_pickup_datetime);
ON nyc_yellow_taxi_trips (tpep_pickup_datetime);
SELECT count(*) FROM nyc_yellow_taxi_trips_2016_06_01;
SELECT count(*) FROM nyc_yellow_taxi_trips;
-- Listing 12-8: Counting taxi trips by hour
SET TIME ZONE 'US/Eastern'; -- Set this if your PostgreSQL server defaults to a time zone other than US/Eastern
SELECT
date_part('hour', tpep_pickup_datetime) AS trip_hour,
count(*)
FROM nyc_yellow_taxi_trips_2016_06_01
FROM nyc_yellow_taxi_trips
GROUP BY trip_hour
ORDER BY trip_hour;
@ -174,12 +181,13 @@ COPY
(SELECT
date_part('hour', tpep_pickup_datetime) AS trip_hour,
count(*)
FROM nyc_yellow_taxi_trips_2016_06_01
FROM nyc_yellow_taxi_trips
GROUP BY trip_hour
ORDER BY trip_hour
)
TO 'C:\YourDirectory\hourly_pickups_2016_06_01.csv'
WITH (FORMAT CSV, HEADER, DELIMITER ',');
-- TO 'C:\YourDirectory\hourly_taxi_pickups.csv'
TO '/Users/adebarros/Desktop/hourly_taxi_pickups.csv'
WITH (FORMAT CSV, HEADER);
-- Listing 12-10: Calculating median trip time by hour
@ -188,29 +196,29 @@ SELECT
percentile_cont(.5)
WITHIN GROUP (ORDER BY
tpep_dropoff_datetime - tpep_pickup_datetime) AS median_trip
FROM nyc_yellow_taxi_trips_2016_06_01
FROM nyc_yellow_taxi_trips
GROUP BY trip_hour
ORDER BY trip_hour;
-- Listing 12-11: Creating a table to hold train trip data
SET timezone TO 'US/Central';
CREATE TABLE train_rides (
trip_id bigserial PRIMARY KEY,
segment varchar(50) NOT NULL,
trip_id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
segment text NOT NULL,
departure timestamp with time zone NOT NULL,
arrival timestamp with time zone NOT NULL
);
INSERT INTO train_rides (segment, departure, arrival)
VALUES
('Chicago to New York', '2017-11-13 21:30 CST', '2017-11-14 18:23 EST'),
('New York to New Orleans', '2017-11-15 14:15 EST', '2017-11-16 19:32 CST'),
('New Orleans to Los Angeles', '2017-11-17 13:45 CST', '2017-11-18 9:00 PST'),
('Los Angeles to San Francisco', '2017-11-19 10:10 PST', '2017-11-19 21:24 PST'),
('San Francisco to Denver', '2017-11-20 9:10 PST', '2017-11-21 18:38 MST'),
('Denver to Chicago', '2017-11-22 19:10 MST', '2017-11-23 14:50 CST');
('Chicago to New York', '2020-11-13 21:30 CST', '2020-11-14 18:23 EST'),
('New York to New Orleans', '2020-11-15 14:15 EST', '2020-11-16 19:32 CST'),
('New Orleans to Los Angeles', '2020-11-17 13:45 CST', '2020-11-18 9:00 PST'),
('Los Angeles to San Francisco', '2020-11-19 10:10 PST', '2020-11-19 21:24 PST'),
('San Francisco to Denver', '2020-11-20 9:10 PST', '2020-11-21 18:38 MST'),
('Denver to Chicago', '2020-11-22 19:10 MST', '2020-11-23 14:50 CST');
SET TIME ZONE 'US/Central';
SELECT * FROM train_rides;

View File

Can't render this file because it is too large.

View File

@ -729,3 +729,70 @@ FROM pls_fy2018_libraries
WHERE popu_lsa >= 250000;
----------------------------------------------------------------------------
-- Chapter 12: Working with Dates and Times
----------------------------------------------------------------------------
-- 1. Using the New York City taxi data, calculate the length of each ride using
-- the pickup and drop-off timestamps. Sort the query results from the longest
-- ride to the shortest. Do you notice anything about the longest or shortest
-- trips that you might want to ask city officials about?
-- Answer: More than 480 of the trips last more than 10 hours, which seems
-- excessive. Moreover, two records have drop-off times before the pickup time,
-- and several have pickup and drop-off times that are the same. It's worth
-- asking whether these records have timestamp errors.
SELECT
trip_id,
tpep_pickup_datetime,
tpep_dropoff_datetime,
tpep_dropoff_datetime - tpep_pickup_datetime AS length_of_ride
FROM nyc_yellow_taxi_trips
ORDER BY length_of_ride DESC;
-- 2. Using the AT TIME ZONE keywords, write a query that displays the date and
-- time for London, Johannesburg, Moscow, and Melbourne the moment January 1,
-- 2100, arrives in New York City.
-- Answer:
SELECT '2100-01-01 00:00:00-05' AT TIME ZONE 'US/Eastern' AS new_york,
'2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/London' AS london,
'2100-01-01 00:00:00-05' AT TIME ZONE 'Africa/Johannesburg' AS johannesburg,
'2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/Moscow' AS moscow,
'2100-01-01 00:00:00-05' AT TIME ZONE 'Australia/Melbourne' AS melbourne;
-- 3. As a bonus challenge, use the statistics functions in Chapter 11 to
-- calculate the correlation coefficient and r-squared values using trip time
-- and the total_amount column in the New York City taxi data, which represents
-- total amount charged to passengers. Do the same with trip_distance and
-- total_amount. Limit the query to rides that last three hours or less.
-- Answer:
SELECT
round(
corr(total_amount, (
date_part('epoch', tpep_dropoff_datetime) -
date_part('epoch', tpep_pickup_datetime)
))::numeric, 2
) AS amount_time_corr,
round(
regr_r2(total_amount, (
date_part('epoch', tpep_dropoff_datetime) -
date_part('epoch', tpep_pickup_datetime)
))::numeric, 2
) AS amount_time_r2,
round(
corr(total_amount, trip_distance)::numeric, 2
) AS amount_distance_corr,
round(
regr_r2(total_amount, trip_distance)::numeric, 2
) AS amount_distance_r2
FROM nyc_yellow_taxi_trips
WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval;
-- Note: Both correlations are strong, with r values of 0.80 or higher. We'd
-- expect this given the cost of a taxi ride is based on both time and distance.