Chapter 14 first revision

This commit is contained in:
anthonydb 2021-01-11 21:52:09 -05:00
parent 1a389ccd3f
commit 602ac5e580
4 changed files with 8234 additions and 4724 deletions

View File

@ -1,14 +1,10 @@
-- FIRST EDITION FILE; IGNORE ---------------------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition
--------------------------------------------------------------
-- Practical SQL: A Beginner's Guide to Storytelling with Data
-- by Anthony DeBarros -- by Anthony DeBarros
-- Chapter 14 Code Examples -- Chapter 14 Code Examples
-------------------------------------------------------------- ----------------------------------------------------------------------------
-- Commonly used string functions -- Commonly used string functions
-- Full list at https://www.postgresql.org/docs/current/static/functions-string.html -- Full list at https://www.postgresql.org/docs/current/static/functions-string.html
@ -39,70 +35,100 @@ SELECT right('703-555-1212', 8);
SELECT replace('bat', 'b', 'c'); SELECT replace('bat', 'b', 'c');
-- Table 14-2: Regular Expression Matching Examples -- Table 14-1: Regular Expression Matching Examples
-- Any character one or more times -- Any character one or more times
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '.+'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '.+');
-- One or two digits followed by a space and p.m. -- One or two digits followed by a space and p.m.
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\d{1,2} (?:a.m.|p.m.)'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{1,2} (?:a.m.|p.m.)');
-- One or more word characters at the start -- One or more word characters at the start
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '^\w+'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '^\w+');
-- One or more word characters followed by any character at the end. -- One or more word characters followed by any character at the end.
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\w+.$'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\w+.$');
-- The words May or June -- The words May or June
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from 'May|June'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May|June');
-- Four digits -- Four digits
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from '\d{4}'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{4}');
-- May followed by a space, digit, comma, space, and four digits. -- May followed by a space, digit, comma, space, and four digits.
SELECT substring('The game starts at 7 p.m. on May 2, 2019.' from 'May \d, \d{4}'); SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May \d, \d{4}');
-- Listing 14-1: Using regular expressions in a WHERE clause
SELECT county_name
FROM us_counties_pop_est_2019
WHERE county_name ~* '(.+lade.+|.+lare.+)'
ORDER BY county_name;
SELECT county_name
FROM us_counties_pop_est_2019
WHERE county_name ~* '.+ash.+' AND county_name !~ 'Wash.+'
ORDER BY county_name;
-- Listing 14-2: Regular expression functions to replace and split
SELECT regexp_replace('05/12/2024', '\d{4}', '2023');
SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ',');
SELECT regexp_split_to_array('Phil Mike Tony Steve', ' ');
-- Listing 14-3: Finding an array length
SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1);
-- Turning Text to Data with Regular Expression Functions -- Turning Text to Data with Regular Expression Functions
-- Listing 14-2: Creating and loading the crime_reports table -- Listing 14-5: Creating and loading the crime_reports table
-- Data from https://sheriff.loudoun.gov/dailycrime -- Data from https://sheriff.loudoun.gov/dailycrime
CREATE TABLE crime_reports ( CREATE TABLE crime_reports (
crime_id bigserial PRIMARY KEY, crime_id integer PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
case_number text,
date_1 timestamp with time zone, date_1 timestamp with time zone,
date_2 timestamp with time zone, date_2 timestamp with time zone,
street varchar(250), street text,
city varchar(100), city text,
crime_type varchar(100), crime_type text,
description text, description text,
case_number varchar(50),
original_text text NOT NULL original_text text NOT NULL
); );
COPY crime_reports (original_text) COPY crime_reports (original_text)
FROM 'C:\YourDirectory\crime_reports.csv' -- FROM 'C:\YourDirectory\crime_reports.csv'
FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_14/crime_reports.csv'
WITH (FORMAT CSV, HEADER OFF, QUOTE '"'); WITH (FORMAT CSV, HEADER OFF, QUOTE '"');
SELECT original_text FROM crime_reports; SELECT original_text FROM crime_reports;
-- Listing 14-3: Using regexp_match() to find the first date -- Listing 14-6: Using regexp_match() to find the first date
SELECT crime_id, SELECT crime_id,
regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}')
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-4: Using the regexp_matches() function with the 'g' flag -- Listing 14-7: Using the regexp_matches() function with the 'g' flag
SELECT crime_id, SELECT crime_id,
regexp_matches(original_text, '\d{1,2}\/\d{1,2}\/\d{2}', 'g') regexp_matches(original_text, '\d{1,2}\/\d{1,2}\/\d{2}', 'g')
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-5: Using regexp_match() to find the second date -- Listing 14-8: Using regexp_match() to find the second date
-- Note that the result includes an unwanted hyphen -- Note that the result includes an unwanted hyphen
SELECT crime_id, SELECT crime_id,
regexp_match(original_text, '-\d{1,2}\/\d{1,2}\/\d{1,2}') regexp_match(original_text, '-\d{1,2}\/\d{1,2}\/\d{1,2}')
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-6: Using a capture group to return only the date -- Listing 14-9: Using a capture group to return only the date
-- Eliminates the hyphen -- Eliminates the hyphen
SELECT crime_id, SELECT crime_id,
regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})') regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{1,2})')
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-7: Matching case number, date, crime type, and city -- Listing 14-10: Matching case number, date, crime type, and city
SELECT SELECT
regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number, regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number,
@ -110,7 +136,8 @@ SELECT
regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type, regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type,
regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n') regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n')
AS city AS city
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Bonus: Get all parsed elements at once -- Bonus: Get all parsed elements at once
@ -130,17 +157,19 @@ SELECT crime_id,
regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type, regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type,
regexp_match(original_text, ':\s(.+)(?:C0|SO)') AS description, regexp_match(original_text, ':\s(.+)(?:C0|SO)') AS description,
regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-8: Retrieving a value from within an array -- Listing 14-11: Retrieving a value from within an array
SELECT SELECT
crime_id, crime_id,
(regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1] (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1]
AS case_number AS case_number
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-9: Updating the crime_reports date_1 column -- Listing 14-12: Updating the crime_reports date_1 column
UPDATE crime_reports UPDATE crime_reports
SET date_1 = SET date_1 =
@ -149,14 +178,10 @@ SET date_1 =
|| ' ' || || ' ' ||
(regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1] (regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1]
||' US/Eastern' ||' US/Eastern'
)::timestamptz; )::timestamptz
RETURNING crime_id, date_1, original_text;
SELECT crime_id, -- Listing 14-13: Updating all crime_reports columns
date_1,
original_text
FROM crime_reports;
-- Listing 14-10: Updating all crime_reports columns
UPDATE crime_reports UPDATE crime_reports
SET date_1 = SET date_1 =
@ -198,38 +223,14 @@ SET date_1 =
description = (regexp_match(original_text, ':\s(.+)(?:C0|SO)'))[1], description = (regexp_match(original_text, ':\s(.+)(?:C0|SO)'))[1],
case_number = (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1]; case_number = (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1];
-- Listing 14-11: Viewing selected crime data -- Listing 14-14: Viewing selected crime data
SELECT date_1, SELECT date_1,
street, street,
city, city,
crime_type crime_type
FROM crime_reports; FROM crime_reports
ORDER BY crime_id;
-- Listing 14-12: Using regular expressions in a WHERE clause
SELECT geo_name
FROM us_counties_2010
WHERE geo_name ~* '(.+lade.+|.+lare.+)'
ORDER BY geo_name;
SELECT geo_name
FROM us_counties_2010
WHERE geo_name ~* '.+ash.+' AND geo_name !~ 'Wash.+'
ORDER BY geo_name;
-- Listing 14-13: Regular expression functions to replace and split
SELECT regexp_replace('05/12/2018', '\d{4}', '2017');
SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ',');
SELECT regexp_split_to_array('Phil Mike Tony Steve', ' ');
-- Listing 14-14: Finding an array length
SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1);
-- FULL TEXT SEARCH -- FULL TEXT SEARCH
@ -261,16 +262,17 @@ SELECT to_tsvector('I am walking across the sitting room') @@ to_tsquery('walkin
-- https://www.eisenhower.archives.gov/all_about_ike/speeches.html -- https://www.eisenhower.archives.gov/all_about_ike/speeches.html
CREATE TABLE president_speeches ( CREATE TABLE president_speeches (
sotu_id serial PRIMARY KEY, president text NOT NULL,
president varchar(100) NOT NULL, title text NOT NULL,
title varchar(250) NOT NULL,
speech_date date NOT NULL, speech_date date NOT NULL,
speech_text text NOT NULL, speech_text text NOT NULL,
search_speech_text tsvector search_speech_text tsvector,
CONSTRAINT speech_key PRIMARY KEY (president, speech_date)
); );
COPY president_speeches (president, title, speech_date, speech_text) COPY president_speeches (president, title, speech_date, speech_text)
FROM 'C:\YourDirectory\sotu-1946-1977.csv' -- FROM 'C:\YourDirectory\president_speeches.csv'
FROM '/Users/adebarros/Dropbox/DataMonky/Book-Writing/PracticalSQL_2e/Code-Repo/Chapter_14/president_speeches.csv'
WITH (FORMAT CSV, DELIMITER '|', HEADER OFF, QUOTE '@'); WITH (FORMAT CSV, DELIMITER '|', HEADER OFF, QUOTE '@');
SELECT * FROM president_speeches; SELECT * FROM president_speeches;
@ -295,14 +297,15 @@ ORDER BY speech_date;
SELECT president, SELECT president,
speech_date, speech_date,
ts_headline(speech_text, to_tsquery('Vietnam'), ts_headline(speech_text, to_tsquery('tax'),
'StartSel = <, 'StartSel = <,
StopSel = >, StopSel = >,
MinWords=5, MinWords=5,
MaxWords=7, MaxWords=7,
MaxFragments=1') MaxFragments=1')
FROM president_speeches FROM president_speeches
WHERE search_speech_text @@ to_tsquery('Vietnam'); WHERE search_speech_text @@ to_tsquery('tax');
-- Listing 14-23: Finding speeches with the word "transportation" but not "roads" -- Listing 14-23: Finding speeches with the word "transportation" but not "roads"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -867,3 +867,76 @@ AS (flavor text,
midtown bigint, midtown bigint,
uptown bigint); uptown bigint);
----------------------------------------------------------------------------
-- Chapter 14: Mining Text to Find Meaningful Data
----------------------------------------------------------------------------
-- 1. The style guide of a publishing company you're writing for wants you to
-- avoid commas before suffixes in names. But there are several names like
-- Alvarez, Jr. and Williams, Sr. in your author database. Which functions can
-- you use to remove the comma? Would a regular expression function help?
-- How would you capture just the suffixes to place them into a separate column?
-- Answer: You can use either the standard SQL replace() function or the
-- PostgreSQL regexp_replace() function:
SELECT replace('Williams, Sr.', ', ', ' ');
SELECT regexp_replace('Williams, Sr.', ', ', ' ');
-- Answer: To capture just the suffixes, search for characters after a comma
-- and space and place those inside a match group:
SELECT (regexp_match('Williams, Sr.', '.*, (.*)'))[1];
-- 2. Using any one of the presidents' speeches addresses, count the number of
-- unique words that are five characters or more. Hint: you can use
-- regexp_split_to_table() in a subquery to create a table of words to count.
-- Bonus: remove commas and periods at the end of each word.
-- Answer:
-- This query uses a Common Table Expression to first separate each word
-- in the text into a separate row in a table named word_list. Then the SELECT
-- statement counts the words, which are cleaned up with two operations. First,
-- several nested replace functions remove commas, periods, and colons. Second,
-- all words are converted to lowercase so that when we count we group words
-- that may appear with various cases (e.g., "Military" and "military").
WITH
word_list (word)
AS
(
SELECT regexp_split_to_table(speech_text, '\s') AS word
FROM president_speeches
WHERE speech_date = '1946-01-21'
)
SELECT lower(
replace(replace(replace(word, ',', ''), '.', ''), ':', '')
) AS cleaned_word,
count(*)
FROM word_list
WHERE length(word) >= 5
GROUP BY cleaned_word
ORDER BY count(*) DESC;
-- 3. Rewrite the query in Listing 14-25 using the ts_rank_cd() function
-- instead of ts_rank(). According to th PostgreSQL documentation, ts_rank_cd()
-- computes cover density, which takes into account how close the lexeme search
-- terms are to each other. Does using the ts_rank_cd() function significantly
-- change the results?
-- Answer:
-- The ranking does change, highlighting post-9/11 speeches by George W. Bush.
-- The change might be more or less pronounced given another set of texts.
SELECT president,
speech_date,
ts_rank_cd(search_speech_text, search_query, 2) AS rank_score
FROM president_speeches,
to_tsquery('war & security & threat & enemy') search_query
WHERE search_speech_text @@ search_query
ORDER BY rank_score DESC
LIMIT 5;