386 lines
13 KiB
SQL
386 lines
13 KiB
SQL
---------------------------------------------------------------------------
|
|
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
|
|
-- by Anthony DeBarros
|
|
|
|
-- Chapter 14 Code Examples
|
|
----------------------------------------------------------------------------
|
|
|
|
|
|
-- Commonly used string functions
|
|
-- Full list at https://www.postgresql.org/docs/current/functions-string.html
|
|
|
|
-- Case formatting
|
|
SELECT upper('Neal7');
|
|
SELECT lower('Randy');
|
|
SELECT initcap('at the end of the day');
|
|
-- Note initcap's imperfect for acronyms
|
|
SELECT initcap('Practical SQL');
|
|
|
|
-- Character Information
|
|
SELECT char_length(' Pat ');
|
|
SELECT length(' Pat ');
|
|
SELECT position(', ' in 'Tan, Bella');
|
|
|
|
-- Removing characters
|
|
SELECT trim('s' from 'socks');
|
|
SELECT trim(trailing 's' from 'socks');
|
|
SELECT trim(' Pat ');
|
|
SELECT char_length(trim(' Pat ')); -- note the length change
|
|
SELECT ltrim('socks', 's');
|
|
SELECT rtrim('socks', 's');
|
|
|
|
-- Extracting and replacing characters
|
|
SELECT left('703-555-1212', 3);
|
|
SELECT right('703-555-1212', 8);
|
|
SELECT replace('bat', 'b', 'c');
|
|
|
|
|
|
-- Table 14-2: Regular Expression Matching Examples
|
|
|
|
-- Any character one or more times
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '.+');
|
|
-- One or two digits followed by a space and a.m. or p.m. in a noncapture group
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{1,2} (?:a.m.|p.m.)');
|
|
-- One or more word characters at the start
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '^\w+');
|
|
-- One or more word characters followed by any character at the end.
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\w+.$');
|
|
-- The words May or June
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May|June');
|
|
-- Four digits
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{4}');
|
|
-- May followed by a space, digit, comma, space, and four digits.
|
|
SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May \d, \d{4}');
|
|
|
|
-- Listing 14-1: Using regular expressions in a WHERE clause
|
|
|
|
SELECT county_name
|
|
FROM us_counties_pop_est_2019
|
|
WHERE county_name ~* '(lade|lare)'
|
|
ORDER BY county_name;
|
|
|
|
SELECT county_name
|
|
FROM us_counties_pop_est_2019
|
|
WHERE county_name ~* 'ash' AND county_name !~ 'Wash'
|
|
ORDER BY county_name;
|
|
|
|
-- Listing 14-2: Regular expression functions to replace and split text
|
|
|
|
SELECT regexp_replace('05/12/2024', '\d{4}', '2023');
|
|
|
|
SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ',');
|
|
|
|
SELECT regexp_split_to_array('Phil Mike Tony Steve', ' ');
|
|
|
|
-- Listing 14-3: Finding an array length
|
|
|
|
SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1);
|
|
|
|
|
|
-- Turning Text to Data with Regular Expression Functions
|
|
|
|
-- Listing 14-5: Creating and loading the crime_reports table
|
|
-- Data from https://sheriff.loudoun.gov/dailycrime
|
|
|
|
CREATE TABLE crime_reports (
|
|
crime_id integer PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
|
|
case_number text,
|
|
date_1 timestamptz, -- note: this is the PostgreSQL shortcut for timestamp with time zone
|
|
date_2 timestamptz, -- note: this is the PostgreSQL shortcut for timestamp with time zone
|
|
street text,
|
|
city text,
|
|
crime_type text,
|
|
description text,
|
|
original_text text NOT NULL
|
|
);
|
|
|
|
COPY crime_reports (original_text)
|
|
FROM 'C:\YourDirectory\crime_reports.csv'
|
|
WITH (FORMAT CSV, HEADER OFF, QUOTE '"');
|
|
|
|
SELECT original_text FROM crime_reports;
|
|
|
|
-- Listing 14-6: Using regexp_match() to find the first date
|
|
SELECT crime_id,
|
|
regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}')
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Listing 14-7: Using the regexp_matches() function with the 'g' flag
|
|
SELECT crime_id,
|
|
regexp_matches(original_text, '\d{1,2}\/\d{1,2}\/\d{2}', 'g')
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Listing 14-8: Using regexp_match() to find the second date
|
|
-- Note that the result includes an unwanted hyphen
|
|
SELECT crime_id,
|
|
regexp_match(original_text, '-\d{1,2}\/\d{1,2}\/\d{2}')
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Listing 14-9: Using a capture group to return only the date
|
|
-- Eliminates the hyphen
|
|
SELECT crime_id,
|
|
regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})')
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Listing 14-10: Matching case number, date, crime type, and city
|
|
|
|
SELECT
|
|
regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number,
|
|
regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') AS date_1,
|
|
regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type,
|
|
regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n')
|
|
AS city
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Bonus: Get all parsed elements at once
|
|
|
|
SELECT crime_id,
|
|
regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') AS date_1,
|
|
CASE WHEN EXISTS (SELECT regexp_matches(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})'))
|
|
THEN regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})')
|
|
ELSE NULL
|
|
END AS date_2,
|
|
regexp_match(original_text, '\/\d{2}\n(\d{4})') AS hour_1,
|
|
CASE WHEN EXISTS (SELECT regexp_matches(original_text, '\/\d{2}\n\d{4}-(\d{4})'))
|
|
THEN regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})')
|
|
ELSE NULL
|
|
END AS hour_2,
|
|
regexp_match(original_text, 'hrs.\n(\d+ .+(?:Sq.|Plz.|Dr.|Ter.|Rd.))') AS street,
|
|
regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n') AS city,
|
|
regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type,
|
|
regexp_match(original_text, ':\s(.+)(?:C0|SO)') AS description,
|
|
regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Listing 14-11: Retrieving a value from within an array
|
|
|
|
SELECT
|
|
crime_id,
|
|
(regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1]
|
|
AS case_number
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
-- Listing 14-12: Updating the crime_reports date_1 column
|
|
|
|
UPDATE crime_reports
|
|
SET date_1 =
|
|
(
|
|
(regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1]
|
|
|| ' ' ||
|
|
(regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1]
|
|
||' US/Eastern'
|
|
)::timestamptz
|
|
RETURNING crime_id, date_1, original_text;
|
|
|
|
-- Listing 14-13: Updating all crime_reports columns
|
|
|
|
UPDATE crime_reports
|
|
SET date_1 =
|
|
(
|
|
(regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1]
|
|
|| ' ' ||
|
|
(regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1]
|
|
||' US/Eastern'
|
|
)::timestamptz,
|
|
|
|
date_2 =
|
|
CASE
|
|
-- if there is no second date but there is a second hour
|
|
WHEN (SELECT regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})') IS NULL)
|
|
AND (SELECT regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') IS NOT NULL)
|
|
THEN
|
|
((regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1]
|
|
|| ' ' ||
|
|
(regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})'))[1]
|
|
||' US/Eastern'
|
|
)::timestamptz
|
|
|
|
-- if there is both a second date and second hour
|
|
WHEN (SELECT regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})') IS NOT NULL)
|
|
AND (SELECT regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') IS NOT NULL)
|
|
THEN
|
|
((regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})'))[1]
|
|
|| ' ' ||
|
|
(regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})'))[1]
|
|
||' US/Eastern'
|
|
)::timestamptz
|
|
END,
|
|
street = (regexp_match(original_text, 'hrs.\n(\d+ .+(?:Sq.|Plz.|Dr.|Ter.|Rd.))'))[1],
|
|
city = (regexp_match(original_text,
|
|
'(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n'))[1],
|
|
crime_type = (regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):'))[1],
|
|
description = (regexp_match(original_text, ':\s(.+)(?:C0|SO)'))[1],
|
|
case_number = (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1];
|
|
|
|
-- Listing 14-14: Viewing selected crime data
|
|
|
|
SELECT date_1,
|
|
street,
|
|
city,
|
|
crime_type
|
|
FROM crime_reports
|
|
ORDER BY crime_id;
|
|
|
|
|
|
-- FULL TEXT SEARCH
|
|
|
|
-- Full-text search operators:
|
|
-- & (AND)
|
|
-- | (OR)
|
|
-- ! (NOT)
|
|
|
|
-- Note: You can view installed PostgreSQL search language configurations by running:
|
|
SELECT cfgname FROM pg_ts_config;
|
|
|
|
|
|
-- Listing 14-15: Converting text to tsvector data
|
|
|
|
SELECT to_tsvector('english', 'I am walking across the sitting room to sit with you.');
|
|
|
|
-- Listing 14-16: Converting search terms to tsquery data
|
|
|
|
SELECT to_tsquery('english', 'walking & sitting');
|
|
|
|
-- Listing 14-17: Querying a tsvector type with a tsquery
|
|
|
|
SELECT to_tsvector('english', 'I am walking across the sitting room') @@
|
|
to_tsquery('english', 'walking & sitting');
|
|
|
|
SELECT to_tsvector('english', 'I am walking across the sitting room') @@
|
|
to_tsquery('english', 'walking & running');
|
|
|
|
-- Listing 14-18: Creating and filling the president_speeches table
|
|
|
|
-- Sources:
|
|
-- https://archive.org/details/State-of-the-Union-Addresses-1945-2006
|
|
-- https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union
|
|
-- https://www.eisenhower.archives.gov/all_about_ike/speeches.html
|
|
|
|
CREATE TABLE president_speeches (
|
|
president text NOT NULL,
|
|
title text NOT NULL,
|
|
speech_date date NOT NULL,
|
|
speech_text text NOT NULL,
|
|
search_speech_text tsvector,
|
|
CONSTRAINT speech_key PRIMARY KEY (president, speech_date)
|
|
);
|
|
|
|
COPY president_speeches (president, title, speech_date, speech_text)
|
|
FROM 'C:\YourDirectory\president_speeches.csv'
|
|
WITH (FORMAT CSV, DELIMITER '|', HEADER OFF, QUOTE '@');
|
|
|
|
SELECT * FROM president_speeches ORDER BY speech_date;
|
|
|
|
-- Listing 14-19: Converting speeches to tsvector in the search_speech_text column
|
|
|
|
UPDATE president_speeches
|
|
SET search_speech_text = to_tsvector('english', speech_text);
|
|
|
|
-- Listing 14-20: Creating a GIN index for text search
|
|
|
|
CREATE INDEX search_idx ON president_speeches USING gin(search_speech_text);
|
|
|
|
-- Listing 14-21: Finding speeches containing the word "Vietnam"
|
|
|
|
SELECT president, speech_date
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@ to_tsquery('english', 'Vietnam')
|
|
ORDER BY speech_date;
|
|
|
|
-- Listing 14-22: Displaying search results with ts_headline()
|
|
|
|
SELECT president,
|
|
speech_date,
|
|
ts_headline(speech_text, to_tsquery('english', 'tax'),
|
|
'StartSel = <,
|
|
StopSel = >,
|
|
MinWords=5,
|
|
MaxWords=7,
|
|
MaxFragments=1')
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@ to_tsquery('english', 'tax')
|
|
ORDER BY speech_date;
|
|
|
|
|
|
-- Listing 14-23: Finding speeches with the word "transportation" but not "roads"
|
|
|
|
SELECT president,
|
|
speech_date,
|
|
ts_headline(speech_text,
|
|
to_tsquery('english', 'transportation & !roads'),
|
|
'StartSel = <,
|
|
StopSel = >,
|
|
MinWords=5,
|
|
MaxWords=7,
|
|
MaxFragments=1')
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@
|
|
to_tsquery('english', 'transportation & !roads')
|
|
ORDER BY speech_date;
|
|
|
|
-- Listing 14-24: Finding speeches where "defense" follows "military"
|
|
|
|
SELECT president,
|
|
speech_date,
|
|
ts_headline(speech_text,
|
|
to_tsquery('english', 'military <-> defense'),
|
|
'StartSel = <,
|
|
StopSel = >,
|
|
MinWords=5,
|
|
MaxWords=7,
|
|
MaxFragments=1')
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@
|
|
to_tsquery('english', 'military <-> defense')
|
|
ORDER BY speech_date;
|
|
|
|
-- Bonus: Example with a distance of 2:
|
|
SELECT president,
|
|
speech_date,
|
|
ts_headline(speech_text,
|
|
to_tsquery('english', 'military <2> defense'),
|
|
'StartSel = <,
|
|
StopSel = >,
|
|
MinWords=5,
|
|
MaxWords=7,
|
|
MaxFragments=2')
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@
|
|
to_tsquery('english', 'military <2> defense')
|
|
ORDER BY speech_date;
|
|
|
|
-- Listing 14-25: Scoring relevance with ts_rank()
|
|
|
|
SELECT president,
|
|
speech_date,
|
|
ts_rank(search_speech_text,
|
|
to_tsquery('english', 'war & security & threat & enemy'))
|
|
AS score
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@
|
|
to_tsquery('english', 'war & security & threat & enemy')
|
|
ORDER BY score DESC
|
|
LIMIT 5;
|
|
|
|
-- Listing 14-26: Normalizing ts_rank() by speech length
|
|
|
|
SELECT president,
|
|
speech_date,
|
|
ts_rank(search_speech_text,
|
|
to_tsquery('english', 'war & security & threat & enemy'), 2)::numeric
|
|
AS score
|
|
FROM president_speeches
|
|
WHERE search_speech_text @@
|
|
to_tsquery('english', 'war & security & threat & enemy')
|
|
ORDER BY score DESC
|
|
LIMIT 5;
|
|
|
|
|