--------------------------------------------------------------------------- -- Practical SQL: A Beginner's Guide to Storytelling with Data, 2nd Edition -- by Anthony DeBarros -- Chapter 14 Code Examples ---------------------------------------------------------------------------- -- Commonly used string functions -- Full list at https://www.postgresql.org/docs/current/functions-string.html -- Case formatting SELECT upper('Neal7'); SELECT lower('Randy'); SELECT initcap('at the end of the day'); -- Note initcap's imperfect for acronyms SELECT initcap('Practical SQL'); -- Character Information SELECT char_length(' Pat '); SELECT length(' Pat '); SELECT position(', ' in 'Tan, Bella'); -- Removing characters SELECT trim('s' from 'socks'); SELECT trim(trailing 's' from 'socks'); SELECT trim(' Pat '); SELECT char_length(trim(' Pat ')); -- note the length change SELECT ltrim('socks', 's'); SELECT rtrim('socks', 's'); -- Extracting and replacing characters SELECT left('703-555-1212', 3); SELECT right('703-555-1212', 8); SELECT replace('bat', 'b', 'c'); -- Table 14-2: Regular Expression Matching Examples -- Any character one or more times SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '.+'); -- One or two digits followed by a space and a.m. or p.m. in a noncapture group SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{1,2} (?:a.m.|p.m.)'); -- One or more word characters at the start SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '^\w+'); -- One or more word characters followed by any character at the end. SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\w+.$'); -- The words May or June SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May|June'); -- Four digits SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from '\d{4}'); -- May followed by a space, digit, comma, space, and four digits. SELECT substring('The game starts at 7 p.m. on May 2, 2024.' from 'May \d, \d{4}'); -- Listing 14-1: Using regular expressions in a WHERE clause SELECT county_name FROM us_counties_pop_est_2019 WHERE county_name ~* '(lade|lare)' ORDER BY county_name; SELECT county_name FROM us_counties_pop_est_2019 WHERE county_name ~* 'ash' AND county_name !~ 'Wash' ORDER BY county_name; -- Listing 14-2: Regular expression functions to replace and split text SELECT regexp_replace('05/12/2024', '\d{4}', '2023'); SELECT regexp_split_to_table('Four,score,and,seven,years,ago', ','); SELECT regexp_split_to_array('Phil Mike Tony Steve', ' '); -- Listing 14-3: Finding an array length SELECT array_length(regexp_split_to_array('Phil Mike Tony Steve', ' '), 1); -- Turning Text to Data with Regular Expression Functions -- Listing 14-5: Creating and loading the crime_reports table -- Data from https://sheriff.loudoun.gov/dailycrime CREATE TABLE crime_reports ( crime_id integer PRIMARY KEY GENERATED ALWAYS AS IDENTITY, case_number text, date_1 timestamptz, -- note: this is the PostgreSQL shortcut for timestamp with time zone date_2 timestamptz, -- note: this is the PostgreSQL shortcut for timestamp with time zone street text, city text, crime_type text, description text, original_text text NOT NULL ); COPY crime_reports (original_text) FROM 'C:\YourDirectory\crime_reports.csv' WITH (FORMAT CSV, HEADER OFF, QUOTE '"'); SELECT original_text FROM crime_reports; -- Listing 14-6: Using regexp_match() to find the first date SELECT crime_id, regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') FROM crime_reports ORDER BY crime_id; -- Listing 14-7: Using the regexp_matches() function with the 'g' flag SELECT crime_id, regexp_matches(original_text, '\d{1,2}\/\d{1,2}\/\d{2}', 'g') FROM crime_reports ORDER BY crime_id; -- Listing 14-8: Using regexp_match() to find the second date -- Note that the result includes an unwanted hyphen SELECT crime_id, regexp_match(original_text, '-\d{1,2}\/\d{1,2}\/\d{2}') FROM crime_reports ORDER BY crime_id; -- Listing 14-9: Using a capture group to return only the date -- Eliminates the hyphen SELECT crime_id, regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})') FROM crime_reports ORDER BY crime_id; -- Listing 14-10: Matching case number, date, crime type, and city SELECT regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number, regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') AS date_1, regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type, regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n') AS city FROM crime_reports ORDER BY crime_id; -- Bonus: Get all parsed elements at once SELECT crime_id, regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}') AS date_1, CASE WHEN EXISTS (SELECT regexp_matches(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})')) THEN regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})') ELSE NULL END AS date_2, regexp_match(original_text, '\/\d{2}\n(\d{4})') AS hour_1, CASE WHEN EXISTS (SELECT regexp_matches(original_text, '\/\d{2}\n\d{4}-(\d{4})')) THEN regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') ELSE NULL END AS hour_2, regexp_match(original_text, 'hrs.\n(\d+ .+(?:Sq.|Plz.|Dr.|Ter.|Rd.))') AS street, regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n') AS city, regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):') AS crime_type, regexp_match(original_text, ':\s(.+)(?:C0|SO)') AS description, regexp_match(original_text, '(?:C0|SO)[0-9]+') AS case_number FROM crime_reports ORDER BY crime_id; -- Listing 14-11: Retrieving a value from within an array SELECT crime_id, (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1] AS case_number FROM crime_reports ORDER BY crime_id; -- Listing 14-12: Updating the crime_reports date_1 column UPDATE crime_reports SET date_1 = ( (regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1] || ' ' || (regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1] ||' US/Eastern' )::timestamptz RETURNING crime_id, date_1, original_text; -- Listing 14-13: Updating all crime_reports columns UPDATE crime_reports SET date_1 = ( (regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1] || ' ' || (regexp_match(original_text, '\/\d{2}\n(\d{4})'))[1] ||' US/Eastern' )::timestamptz, date_2 = CASE -- if there is no second date but there is a second hour WHEN (SELECT regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})') IS NULL) AND (SELECT regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') IS NOT NULL) THEN ((regexp_match(original_text, '\d{1,2}\/\d{1,2}\/\d{2}'))[1] || ' ' || (regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})'))[1] ||' US/Eastern' )::timestamptz -- if there is both a second date and second hour WHEN (SELECT regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})') IS NOT NULL) AND (SELECT regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})') IS NOT NULL) THEN ((regexp_match(original_text, '-(\d{1,2}\/\d{1,2}\/\d{2})'))[1] || ' ' || (regexp_match(original_text, '\/\d{2}\n\d{4}-(\d{4})'))[1] ||' US/Eastern' )::timestamptz END, street = (regexp_match(original_text, 'hrs.\n(\d+ .+(?:Sq.|Plz.|Dr.|Ter.|Rd.))'))[1], city = (regexp_match(original_text, '(?:Sq.|Plz.|Dr.|Ter.|Rd.)\n(\w+ \w+|\w+)\n'))[1], crime_type = (regexp_match(original_text, '\n(?:\w+ \w+|\w+)\n(.*):'))[1], description = (regexp_match(original_text, ':\s(.+)(?:C0|SO)'))[1], case_number = (regexp_match(original_text, '(?:C0|SO)[0-9]+'))[1]; -- Listing 14-14: Viewing selected crime data SELECT date_1, street, city, crime_type FROM crime_reports ORDER BY crime_id; -- FULL TEXT SEARCH -- Full-text search operators: -- & (AND) -- | (OR) -- ! (NOT) -- Note: You can view installed PostgreSQL search language configurations by running: SELECT cfgname FROM pg_ts_config; -- Listing 14-15: Converting text to tsvector data SELECT to_tsvector('english', 'I am walking across the sitting room to sit with you.'); -- Listing 14-16: Converting search terms to tsquery data SELECT to_tsquery('english', 'walking & sitting'); -- Listing 14-17: Querying a tsvector type with a tsquery SELECT to_tsvector('english', 'I am walking across the sitting room') @@ to_tsquery('english', 'walking & sitting'); SELECT to_tsvector('english', 'I am walking across the sitting room') @@ to_tsquery('english', 'walking & running'); -- Listing 14-18: Creating and filling the president_speeches table -- Sources: -- https://archive.org/details/State-of-the-Union-Addresses-1945-2006 -- https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union -- https://www.eisenhower.archives.gov/all_about_ike/speeches.html CREATE TABLE president_speeches ( president text NOT NULL, title text NOT NULL, speech_date date NOT NULL, speech_text text NOT NULL, search_speech_text tsvector, CONSTRAINT speech_key PRIMARY KEY (president, speech_date) ); COPY president_speeches (president, title, speech_date, speech_text) FROM 'C:\YourDirectory\president_speeches.csv' WITH (FORMAT CSV, DELIMITER '|', HEADER OFF, QUOTE '@'); SELECT * FROM president_speeches ORDER BY speech_date; -- Listing 14-19: Converting speeches to tsvector in the search_speech_text column UPDATE president_speeches SET search_speech_text = to_tsvector('english', speech_text); -- Listing 14-20: Creating a GIN index for text search CREATE INDEX search_idx ON president_speeches USING gin(search_speech_text); -- Listing 14-21: Finding speeches containing the word "Vietnam" SELECT president, speech_date FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'Vietnam') ORDER BY speech_date; -- Listing 14-22: Displaying search results with ts_headline() SELECT president, speech_date, ts_headline(speech_text, to_tsquery('english', 'tax'), 'StartSel = <, StopSel = >, MinWords=5, MaxWords=7, MaxFragments=1') FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'tax') ORDER BY speech_date; -- Listing 14-23: Finding speeches with the word "transportation" but not "roads" SELECT president, speech_date, ts_headline(speech_text, to_tsquery('english', 'transportation & !roads'), 'StartSel = <, StopSel = >, MinWords=5, MaxWords=7, MaxFragments=1') FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'transportation & !roads') ORDER BY speech_date; -- Listing 14-24: Finding speeches where "defense" follows "military" SELECT president, speech_date, ts_headline(speech_text, to_tsquery('english', 'military <-> defense'), 'StartSel = <, StopSel = >, MinWords=5, MaxWords=7, MaxFragments=1') FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'military <-> defense') ORDER BY speech_date; -- Bonus: Example with a distance of 2: SELECT president, speech_date, ts_headline(speech_text, to_tsquery('english', 'military <2> defense'), 'StartSel = <, StopSel = >, MinWords=5, MaxWords=7, MaxFragments=2') FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'military <2> defense') ORDER BY speech_date; -- Listing 14-25: Scoring relevance with ts_rank() SELECT president, speech_date, ts_rank(search_speech_text, to_tsquery('english', 'war & security & threat & enemy')) AS score FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'war & security & threat & enemy') ORDER BY score DESC LIMIT 5; -- Listing 14-26: Normalizing ts_rank() by speech length SELECT president, speech_date, ts_rank(search_speech_text, to_tsquery('english', 'war & security & threat & enemy'), 2)::numeric AS score FROM president_speeches WHERE search_speech_text @@ to_tsquery('english', 'war & security & threat & enemy') ORDER BY score DESC LIMIT 5;