Chapter 12 code and data
This commit is contained in:
parent
5710d13161
commit
4db62752dc
@ -1,60 +1,62 @@
|
||||
-- FIRST EDITION FILE; IGNORE
|
||||
|
||||
|
||||
|
||||
--------------------------------------------------------------
|
||||
-- Practical SQL: A Beginner's Guide to Storytelling with Data
|
||||
---------------------------------------------------------------------------
|
||||
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
|
||||
-- by Anthony DeBarros
|
||||
|
||||
-- Chapter 12 Code Examples
|
||||
--------------------------------------------------------------
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
|
||||
-- Listing 12-1: Extracting components of a timestamp value using date_part()
|
||||
|
||||
SHOW ALL;
|
||||
|
||||
SELECT
|
||||
date_part('year', '2019-12-01 18:37:12 EST'::timestamptz) AS "year",
|
||||
date_part('month', '2019-12-01 18:37:12 EST'::timestamptz) AS "month",
|
||||
date_part('day', '2019-12-01 18:37:12 EST'::timestamptz) AS "day",
|
||||
date_part('hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "hour",
|
||||
date_part('minute', '2019-12-01 18:37:12 EST'::timestamptz) AS "minute",
|
||||
date_part('seconds', '2019-12-01 18:37:12 EST'::timestamptz) AS "seconds",
|
||||
date_part('timezone_hour', '2019-12-01 18:37:12 EST'::timestamptz) AS "tz",
|
||||
date_part('week', '2019-12-01 18:37:12 EST'::timestamptz) AS "week",
|
||||
date_part('quarter', '2019-12-01 18:37:12 EST'::timestamptz) AS "quarter",
|
||||
date_part('epoch', '2019-12-01 18:37:12 EST'::timestamptz) AS "epoch";
|
||||
date_part('year', '2022-12-01 18:37:12 EST'::timestamptz) AS year,
|
||||
date_part('month', '2022-12-01 18:37:12 EST'::timestamptz) AS month,
|
||||
date_part('day', '2022-12-01 18:37:12 EST'::timestamptz) AS day,
|
||||
date_part('hour', '2022-12-01 18:37:12 EST'::timestamptz) AS hour,
|
||||
date_part('minute', '2022-12-01 18:37:12 EST'::timestamptz) AS minute,
|
||||
date_part('seconds', '2022-12-01 18:37:12 EST'::timestamptz) AS seconds,
|
||||
date_part('timezone_hour', '2022-12-01 18:37:12 EST'::timestamptz) AS tz,
|
||||
date_part('week', '2022-12-01 18:37:12 EST'::timestamptz) AS week,
|
||||
date_part('quarter', '2022-12-01 18:37:12 EST'::timestamptz) AS quarter,
|
||||
date_part('epoch', '2022-12-01 18:37:12 EST'::timestamptz) AS epoch;
|
||||
|
||||
SELECT '2022-12-01 18:37:12 EST'::timestamp AT TIME ZONE 'US/Pacific'
|
||||
|
||||
-- Bonus: Using the SQL-standard extract() for similar datetime parsing:
|
||||
|
||||
SELECT extract('year' from '2019-12-01 18:37:12 EST'::timestamptz) AS "year";
|
||||
SELECT extract('year' from '2022-12-01 18:37:12 EST'::timestamptz) AS year;
|
||||
|
||||
-- Listing 12-2: Three functions for making datetimes from components
|
||||
|
||||
-- make a date
|
||||
SELECT make_date(2018, 2, 22);
|
||||
SELECT make_date(2022, 2, 22);
|
||||
-- make a time
|
||||
SELECT make_time(18, 4, 30.3);
|
||||
-- make a timestamp with time zone
|
||||
SELECT make_timestamptz(2018, 2, 22, 18, 4, 30.3, 'Europe/Lisbon');
|
||||
SELECT make_timestamptz(2022, 2, 22, 18, 4, 30.3, 'Europe/Lisbon');
|
||||
|
||||
-- Bonus: Retrieving the current date and time
|
||||
|
||||
SELECT
|
||||
current_timestamp,
|
||||
localtimestamp,
|
||||
current_date,
|
||||
current_time,
|
||||
current_timestamp,
|
||||
localtime,
|
||||
localtimestamp,
|
||||
now();
|
||||
|
||||
-- Listing 12-3: Comparing current_timestamp and clock_timestamp() during row insert
|
||||
|
||||
CREATE TABLE current_time_example (
|
||||
time_id bigserial,
|
||||
time_id integer GENERATED ALWAYS AS IDENTITY,
|
||||
current_timestamp_col timestamp with time zone,
|
||||
clock_timestamp_col timestamp with time zone
|
||||
);
|
||||
|
||||
INSERT INTO current_time_example (current_timestamp_col, clock_timestamp_col)
|
||||
INSERT INTO current_time_example
|
||||
(current_timestamp_col, clock_timestamp_col)
|
||||
(SELECT current_timestamp,
|
||||
clock_timestamp()
|
||||
FROM generate_series(1,1000));
|
||||
@ -63,32 +65,34 @@ SELECT * FROM current_time_example;
|
||||
|
||||
-- Time Zones
|
||||
|
||||
-- Listing 12-4: Showing your PostgreSQL server's default time zone
|
||||
-- Listing 12-4: Viewing your current time zone setting
|
||||
|
||||
SHOW timezone; -- Note: You can see all run-time defaults with SHOW ALL;
|
||||
|
||||
-- Listing 12-5: Showing time zone abbreviations and names
|
||||
|
||||
SELECT * FROM pg_timezone_abbrevs;
|
||||
SELECT * FROM pg_timezone_names;
|
||||
SELECT * FROM pg_timezone_abbrevs ORDER BY abbrev;
|
||||
SELECT * FROM pg_timezone_names ORDER BY name;
|
||||
|
||||
-- Filter to find one
|
||||
SELECT * FROM pg_timezone_names
|
||||
WHERE name LIKE 'Europe%';
|
||||
WHERE name LIKE 'Europe%'
|
||||
ORDER BY name;
|
||||
|
||||
-- Listing 12-6: Setting the time zone for a client session
|
||||
|
||||
SET timezone TO 'US/Pacific';
|
||||
SET TIME ZONE 'US/Pacific';
|
||||
|
||||
CREATE TABLE time_zone_test (
|
||||
test_date timestamp with time zone
|
||||
);
|
||||
INSERT INTO time_zone_test VALUES ('2020-01-01 4:00');
|
||||
INSERT INTO time_zone_test VALUES ('2023-01-01 4:00');
|
||||
INSERT INTO time_zone_test VALUES ('2023-07-01 4:00');
|
||||
|
||||
SELECT test_date
|
||||
FROM time_zone_test;
|
||||
|
||||
SET timezone TO 'US/Eastern';
|
||||
SET TIME ZONE 'US/Eastern';
|
||||
|
||||
SELECT test_date
|
||||
FROM time_zone_test;
|
||||
@ -99,28 +103,28 @@ FROM time_zone_test;
|
||||
|
||||
-- Math with dates!
|
||||
|
||||
SELECT '9/30/1929'::date - '9/27/1929'::date;
|
||||
SELECT '9/30/1929'::date + '5 years'::interval;
|
||||
SELECT '1929-09-30'::date - '1929-09-27'::date;
|
||||
SELECT '1929-09-30'::date + '5 years'::interval;
|
||||
|
||||
|
||||
-- Taxi Rides
|
||||
|
||||
-- Listing 12-7: Creating a table and importing NYC yellow taxi data
|
||||
|
||||
CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 (
|
||||
trip_id bigserial PRIMARY KEY,
|
||||
vendor_id varchar(1) NOT NULL,
|
||||
CREATE TABLE nyc_yellow_taxi_trips (
|
||||
trip_id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
|
||||
vendor_id text NOT NULL,
|
||||
tpep_pickup_datetime timestamp with time zone NOT NULL,
|
||||
tpep_dropoff_datetime timestamp with time zone NOT NULL,
|
||||
passenger_count integer NOT NULL,
|
||||
trip_distance numeric(8,2) NOT NULL,
|
||||
pickup_longitude numeric(18,15) NOT NULL,
|
||||
pickup_latitude numeric(18,15) NOT NULL,
|
||||
rate_code_id varchar(2) NOT NULL,
|
||||
store_and_fwd_flag varchar(1) NOT NULL,
|
||||
rate_code_id text NOT NULL,
|
||||
store_and_fwd_flag text NOT NULL,
|
||||
dropoff_longitude numeric(18,15) NOT NULL,
|
||||
dropoff_latitude numeric(18,15) NOT NULL,
|
||||
payment_type varchar(1) NOT NULL,
|
||||
payment_type text NOT NULL,
|
||||
fare_amount numeric(9,2) NOT NULL,
|
||||
extra numeric(9,2) NOT NULL,
|
||||
mta_tax numeric(5,2) NOT NULL,
|
||||
@ -130,7 +134,7 @@ CREATE TABLE nyc_yellow_taxi_trips_2016_06_01 (
|
||||
total_amount numeric(9,2) NOT NULL
|
||||
);
|
||||
|
||||
COPY nyc_yellow_taxi_trips_2016_06_01 (
|
||||
COPY nyc_yellow_taxi_trips (
|
||||
vendor_id,
|
||||
tpep_pickup_datetime,
|
||||
tpep_dropoff_datetime,
|
||||
@ -151,20 +155,23 @@ COPY nyc_yellow_taxi_trips_2016_06_01 (
|
||||
improvement_surcharge,
|
||||
total_amount
|
||||
)
|
||||
FROM 'C:\YourDirectory\yellow_tripdata_2016_06_01.csv'
|
||||
WITH (FORMAT CSV, HEADER, DELIMITER ',');
|
||||
-- FROM 'C:\YourDirectory\nyc_yellow_taxi_trips.csv'
|
||||
FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_12/nyc_yellow_taxi_trips.csv'
|
||||
WITH (FORMAT CSV, HEADER);
|
||||
|
||||
CREATE INDEX tpep_pickup_idx
|
||||
ON nyc_yellow_taxi_trips_2016_06_01 (tpep_pickup_datetime);
|
||||
ON nyc_yellow_taxi_trips (tpep_pickup_datetime);
|
||||
|
||||
SELECT count(*) FROM nyc_yellow_taxi_trips_2016_06_01;
|
||||
SELECT count(*) FROM nyc_yellow_taxi_trips;
|
||||
|
||||
-- Listing 12-8: Counting taxi trips by hour
|
||||
|
||||
SET TIME ZONE 'US/Eastern'; -- Set this if your PostgreSQL server defaults to a time zone other than US/Eastern
|
||||
|
||||
SELECT
|
||||
date_part('hour', tpep_pickup_datetime) AS trip_hour,
|
||||
count(*)
|
||||
FROM nyc_yellow_taxi_trips_2016_06_01
|
||||
FROM nyc_yellow_taxi_trips
|
||||
GROUP BY trip_hour
|
||||
ORDER BY trip_hour;
|
||||
|
||||
@ -174,12 +181,13 @@ COPY
|
||||
(SELECT
|
||||
date_part('hour', tpep_pickup_datetime) AS trip_hour,
|
||||
count(*)
|
||||
FROM nyc_yellow_taxi_trips_2016_06_01
|
||||
FROM nyc_yellow_taxi_trips
|
||||
GROUP BY trip_hour
|
||||
ORDER BY trip_hour
|
||||
)
|
||||
TO 'C:\YourDirectory\hourly_pickups_2016_06_01.csv'
|
||||
WITH (FORMAT CSV, HEADER, DELIMITER ',');
|
||||
-- TO 'C:\YourDirectory\hourly_taxi_pickups.csv'
|
||||
TO '/Users/adebarros/Desktop/hourly_taxi_pickups.csv'
|
||||
WITH (FORMAT CSV, HEADER);
|
||||
|
||||
-- Listing 12-10: Calculating median trip time by hour
|
||||
|
||||
@ -188,29 +196,29 @@ SELECT
|
||||
percentile_cont(.5)
|
||||
WITHIN GROUP (ORDER BY
|
||||
tpep_dropoff_datetime - tpep_pickup_datetime) AS median_trip
|
||||
FROM nyc_yellow_taxi_trips_2016_06_01
|
||||
FROM nyc_yellow_taxi_trips
|
||||
GROUP BY trip_hour
|
||||
ORDER BY trip_hour;
|
||||
|
||||
-- Listing 12-11: Creating a table to hold train trip data
|
||||
|
||||
SET timezone TO 'US/Central';
|
||||
|
||||
CREATE TABLE train_rides (
|
||||
trip_id bigserial PRIMARY KEY,
|
||||
segment varchar(50) NOT NULL,
|
||||
trip_id bigint GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
|
||||
segment text NOT NULL,
|
||||
departure timestamp with time zone NOT NULL,
|
||||
arrival timestamp with time zone NOT NULL
|
||||
);
|
||||
|
||||
INSERT INTO train_rides (segment, departure, arrival)
|
||||
VALUES
|
||||
('Chicago to New York', '2017-11-13 21:30 CST', '2017-11-14 18:23 EST'),
|
||||
('New York to New Orleans', '2017-11-15 14:15 EST', '2017-11-16 19:32 CST'),
|
||||
('New Orleans to Los Angeles', '2017-11-17 13:45 CST', '2017-11-18 9:00 PST'),
|
||||
('Los Angeles to San Francisco', '2017-11-19 10:10 PST', '2017-11-19 21:24 PST'),
|
||||
('San Francisco to Denver', '2017-11-20 9:10 PST', '2017-11-21 18:38 MST'),
|
||||
('Denver to Chicago', '2017-11-22 19:10 MST', '2017-11-23 14:50 CST');
|
||||
('Chicago to New York', '2020-11-13 21:30 CST', '2020-11-14 18:23 EST'),
|
||||
('New York to New Orleans', '2020-11-15 14:15 EST', '2020-11-16 19:32 CST'),
|
||||
('New Orleans to Los Angeles', '2020-11-17 13:45 CST', '2020-11-18 9:00 PST'),
|
||||
('Los Angeles to San Francisco', '2020-11-19 10:10 PST', '2020-11-19 21:24 PST'),
|
||||
('San Francisco to Denver', '2020-11-20 9:10 PST', '2020-11-21 18:38 MST'),
|
||||
('Denver to Chicago', '2020-11-22 19:10 MST', '2020-11-23 14:50 CST');
|
||||
|
||||
SET TIME ZONE 'US/Central';
|
||||
|
||||
SELECT * FROM train_rides;
|
||||
|
||||
|
||||
|
Can't render this file because it is too large.
|
@ -729,3 +729,70 @@ FROM pls_fy2018_libraries
|
||||
WHERE popu_lsa >= 250000;
|
||||
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
-- Chapter 12: Working with Dates and Times
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
-- 1. Using the New York City taxi data, calculate the length of each ride using
|
||||
-- the pickup and drop-off timestamps. Sort the query results from the longest
|
||||
-- ride to the shortest. Do you notice anything about the longest or shortest
|
||||
-- trips that you might want to ask city officials about?
|
||||
|
||||
-- Answer: More than 480 of the trips last more than 10 hours, which seems
|
||||
-- excessive. Moreover, two records have drop-off times before the pickup time,
|
||||
-- and several have pickup and drop-off times that are the same. It's worth
|
||||
-- asking whether these records have timestamp errors.
|
||||
|
||||
SELECT
|
||||
trip_id,
|
||||
tpep_pickup_datetime,
|
||||
tpep_dropoff_datetime,
|
||||
tpep_dropoff_datetime - tpep_pickup_datetime AS length_of_ride
|
||||
FROM nyc_yellow_taxi_trips
|
||||
ORDER BY length_of_ride DESC;
|
||||
|
||||
-- 2. Using the AT TIME ZONE keywords, write a query that displays the date and
|
||||
-- time for London, Johannesburg, Moscow, and Melbourne the moment January 1,
|
||||
-- 2100, arrives in New York City.
|
||||
|
||||
-- Answer:
|
||||
|
||||
SELECT '2100-01-01 00:00:00-05' AT TIME ZONE 'US/Eastern' AS new_york,
|
||||
'2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/London' AS london,
|
||||
'2100-01-01 00:00:00-05' AT TIME ZONE 'Africa/Johannesburg' AS johannesburg,
|
||||
'2100-01-01 00:00:00-05' AT TIME ZONE 'Europe/Moscow' AS moscow,
|
||||
'2100-01-01 00:00:00-05' AT TIME ZONE 'Australia/Melbourne' AS melbourne;
|
||||
|
||||
-- 3. As a bonus challenge, use the statistics functions in Chapter 11 to
|
||||
-- calculate the correlation coefficient and r-squared values using trip time
|
||||
-- and the total_amount column in the New York City taxi data, which represents
|
||||
-- total amount charged to passengers. Do the same with trip_distance and
|
||||
-- total_amount. Limit the query to rides that last three hours or less.
|
||||
|
||||
-- Answer:
|
||||
|
||||
SELECT
|
||||
round(
|
||||
corr(total_amount, (
|
||||
date_part('epoch', tpep_dropoff_datetime) -
|
||||
date_part('epoch', tpep_pickup_datetime)
|
||||
))::numeric, 2
|
||||
) AS amount_time_corr,
|
||||
round(
|
||||
regr_r2(total_amount, (
|
||||
date_part('epoch', tpep_dropoff_datetime) -
|
||||
date_part('epoch', tpep_pickup_datetime)
|
||||
))::numeric, 2
|
||||
) AS amount_time_r2,
|
||||
round(
|
||||
corr(total_amount, trip_distance)::numeric, 2
|
||||
) AS amount_distance_corr,
|
||||
round(
|
||||
regr_r2(total_amount, trip_distance)::numeric, 2
|
||||
) AS amount_distance_r2
|
||||
FROM nyc_yellow_taxi_trips
|
||||
WHERE tpep_dropoff_datetime - tpep_pickup_datetime <= '3 hours'::interval;
|
||||
|
||||
-- Note: Both correlations are strong, with r values of 0.80 or higher. We'd
|
||||
-- expect this given the cost of a taxi ride is based on both time and distance.
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user