Advanced SQL Patterns
These are the problems that stump most candidates. Gaps and islands, sessionization, funnel analysis, and cohort retention appear at senior-level interviews at top companies. Master these and you will stand out.
Challenge 1: Gaps and Islands
Schema
CREATE TABLE server_status (
log_date DATE PRIMARY KEY,
status VARCHAR(10)
);
INSERT INTO server_status VALUES
('2024-01-01', 'up'), ('2024-01-02', 'up'), ('2024-01-03', 'up'),
('2024-01-04', 'down'), ('2024-01-05', 'down'),
('2024-01-06', 'up'), ('2024-01-07', 'up'), ('2024-01-08', 'up'), ('2024-01-09', 'up'),
('2024-01-10', 'down'),
('2024-01-11', 'up'), ('2024-01-12', 'up');
Solution
-- The key insight: subtract a row number from the date.
-- Consecutive dates with the same status will produce the same "group identifier."
WITH grouped AS (
SELECT
log_date,
status,
log_date - CAST(ROW_NUMBER() OVER (PARTITION BY status ORDER BY log_date) AS INT) * INTERVAL '1 day' AS grp
FROM server_status
)
SELECT
status,
MIN(log_date) AS period_start,
MAX(log_date) AS period_end,
COUNT(*) AS duration_days
FROM grouped
GROUP BY status, grp
ORDER BY period_start;
-- Result:
-- status | period_start | period_end | duration_days
-- -------+--------------+------------+--------------
-- up | 2024-01-01 | 2024-01-03 | 3
-- down | 2024-01-04 | 2024-01-05 | 2
-- up | 2024-01-06 | 2024-01-09 | 4
-- down | 2024-01-10 | 2024-01-10 | 1
-- up | 2024-01-11 | 2024-01-12 | 2
How it works: For consecutive dates with the same status, subtracting an incrementing row number produces the same value. For example, Jan 1 minus row 1, Jan 2 minus row 2, Jan 3 minus row 3 all give Dec 31. This common difference becomes the group key. This is the classic "gaps and islands" technique.
Challenge 2: Pivot / Unpivot
Schema
CREATE TABLE monthly_sales (
salesperson VARCHAR(50),
month VARCHAR(10),
revenue DECIMAL(10,2)
);
INSERT INTO monthly_sales VALUES
('Alice', 'Jan', 15000), ('Alice', 'Feb', 18000), ('Alice', 'Mar', 12000),
('Bob', 'Jan', 12000), ('Bob', 'Feb', 14000), ('Bob', 'Mar', 16000),
('Charlie', 'Jan', 9000), ('Charlie', 'Feb', 11000), ('Charlie', 'Mar', 13000);
Solution
-- PIVOT using CASE WHEN (works in all SQL dialects)
SELECT
salesperson,
SUM(CASE WHEN month = 'Jan' THEN revenue ELSE 0 END) AS jan_revenue,
SUM(CASE WHEN month = 'Feb' THEN revenue ELSE 0 END) AS feb_revenue,
SUM(CASE WHEN month = 'Mar' THEN revenue ELSE 0 END) AS mar_revenue,
SUM(revenue) AS total
FROM monthly_sales
GROUP BY salesperson
ORDER BY total DESC;
-- Result:
-- salesperson | jan_revenue | feb_revenue | mar_revenue | total
-- ------------+-------------+-------------+-------------+------
-- Alice | 15000 | 18000 | 12000 | 45000
-- Bob | 12000 | 14000 | 16000 | 42000
-- Charlie | 9000 | 11000 | 13000 | 33000
-- UNPIVOT using UNION ALL (works in all dialects)
-- Assuming we have the pivoted table as 'pivoted_sales':
WITH pivoted AS (
SELECT salesperson,
SUM(CASE WHEN month = 'Jan' THEN revenue END) AS jan,
SUM(CASE WHEN month = 'Feb' THEN revenue END) AS feb,
SUM(CASE WHEN month = 'Mar' THEN revenue END) AS mar
FROM monthly_sales GROUP BY salesperson
)
SELECT salesperson, 'Jan' AS month, jan AS revenue FROM pivoted
UNION ALL
SELECT salesperson, 'Feb', feb FROM pivoted
UNION ALL
SELECT salesperson, 'Mar', mar FROM pivoted
ORDER BY salesperson, month;
Key insight: PIVOT with CASE WHEN is the most portable approach. SQL Server has native PIVOT/UNPIVOT syntax. PostgreSQL has crosstab() in the tablefunc extension. But the CASE WHEN approach works everywhere and is what interviewers expect you to know.
Challenge 3: Sessionization
Schema
CREATE TABLE user_events (
event_id INT PRIMARY KEY,
user_id INT,
event_time TIMESTAMP,
event_type VARCHAR(50)
);
INSERT INTO user_events VALUES
(1, 1, '2024-01-15 09:00:00', 'page_view'),
(2, 1, '2024-01-15 09:05:00', 'page_view'),
(3, 1, '2024-01-15 09:12:00', 'add_to_cart'),
(4, 1, '2024-01-15 09:15:00', 'checkout'),
(5, 1, '2024-01-15 14:00:00', 'page_view'),
(6, 1, '2024-01-15 14:10:00', 'page_view'),
(7, 1, '2024-01-15 14:55:00', 'add_to_cart'),
(8, 2, '2024-01-15 10:00:00', 'page_view'),
(9, 2, '2024-01-15 10:20:00', 'page_view'),
(10, 2, '2024-01-15 11:30:00', 'page_view');
Solution
WITH with_prev AS (
SELECT
event_id,
user_id,
event_time,
event_type,
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_time
FROM user_events
),
with_boundary AS (
SELECT
*,
CASE
WHEN prev_time IS NULL THEN 1 -- first event = new session
WHEN EXTRACT(EPOCH FROM (event_time - prev_time)) / 60 > 30 THEN 1 -- gap > 30 min
ELSE 0
END AS is_new_session
FROM with_prev
),
with_session_id AS (
SELECT
*,
SUM(is_new_session) OVER (
PARTITION BY user_id ORDER BY event_time
) AS session_id
FROM with_boundary
)
-- Session-level metrics
SELECT
user_id,
session_id,
MIN(event_time) AS session_start,
MAX(event_time) AS session_end,
ROUND(EXTRACT(EPOCH FROM (MAX(event_time) - MIN(event_time))) / 60, 1) AS duration_min,
COUNT(*) AS num_events,
STRING_AGG(event_type, ' > ' ORDER BY event_time) AS event_flow
FROM with_session_id
GROUP BY user_id, session_id
ORDER BY user_id, session_id;
-- Result:
-- user_id | session_id | session_start | session_end | duration_min | num_events | event_flow
-- --------+------------+---------------------+---------------------+--------------+------------+----------------------------------
-- 1 | 1 | 2024-01-15 09:00:00 | 2024-01-15 09:15:00 | 15.0 | 4 | page_view > page_view > add_to_cart > checkout
-- 1 | 2 | 2024-01-15 14:00:00 | 2024-01-15 14:55:00 | 55.0 | 3 | page_view > page_view > add_to_cart
-- 2 | 1 | 2024-01-15 10:00:00 | 2024-01-15 10:20:00 | 20.0 | 2 | page_view > page_view
-- 2 | 2 | 2024-01-15 11:30:00 | 2024-01-15 11:30:00 | 0.0 | 1 | page_view
Key insight: The sessionization pattern uses three steps: (1) LAG to find the previous event time, (2) flag rows where the gap exceeds the threshold, (3) cumulative SUM of flags to create session IDs. This same pattern is used in Google Analytics, Amplitude, and every product analytics platform.
Challenge 4: Funnel Analysis
Solution
WITH funnel_steps AS (
SELECT
user_id,
MAX(CASE WHEN event_type = 'page_view' THEN 1 ELSE 0 END) AS viewed,
MAX(CASE WHEN event_type = 'add_to_cart' THEN 1 ELSE 0 END) AS added_to_cart,
MAX(CASE WHEN event_type = 'checkout' THEN 1 ELSE 0 END) AS checked_out
FROM user_events
GROUP BY user_id
),
funnel_counts AS (
SELECT
SUM(viewed) AS step1_viewers,
SUM(added_to_cart) AS step2_carts,
SUM(checked_out) AS step3_checkouts
FROM funnel_steps
)
SELECT
'page_view' AS step,
step1_viewers AS users,
100.0 AS step_conversion_pct,
100.0 AS overall_conversion_pct
FROM funnel_counts
UNION ALL
SELECT
'add_to_cart',
step2_carts,
ROUND(100.0 * step2_carts / NULLIF(step1_viewers, 0), 1),
ROUND(100.0 * step2_carts / NULLIF(step1_viewers, 0), 1)
FROM funnel_counts
UNION ALL
SELECT
'checkout',
step3_checkouts,
ROUND(100.0 * step3_checkouts / NULLIF(step2_carts, 0), 1),
ROUND(100.0 * step3_checkouts / NULLIF(step1_viewers, 0), 1)
FROM funnel_counts;
-- Result:
-- step | users | step_conversion_pct | overall_conversion_pct
-- -------------+-------+---------------------+-----------------------
-- page_view | 2 | 100.0 | 100.0
-- add_to_cart | 1 | 50.0 | 50.0
-- checkout | 1 | 100.0 | 50.0
Key insight: Use NULLIF(denominator, 0) to prevent division-by-zero errors. The funnel uses MAX(CASE WHEN) to determine if a user ever performed each action. For ordered funnels (user must do step 1 before step 2), add timestamp comparisons: AND cart_time > view_time.
Challenge 5: Cohort Retention Analysis
Schema
CREATE TABLE user_purchases (
purchase_id INT PRIMARY KEY,
user_id INT,
purchase_date DATE,
amount DECIMAL(10,2)
);
INSERT INTO user_purchases VALUES
(1, 1, '2024-01-05', 50), (2, 2, '2024-01-10', 30), (3, 3, '2024-01-15', 80),
(4, 4, '2024-01-20', 45), (5, 5, '2024-02-01', 60),
(6, 1, '2024-02-08', 35), (7, 2, '2024-02-15', 70), (8, 6, '2024-02-10', 90),
(9, 1, '2024-03-05', 55), (10, 3, '2024-03-10', 40), (11, 5, '2024-03-12', 65),
(12, 7, '2024-03-01', 25), (13, 2, '2024-04-01', 80), (14, 1, '2024-04-10', 90);
Solution
WITH first_purchase AS (
SELECT
user_id,
DATE_TRUNC('month', MIN(purchase_date)) AS cohort_month
FROM user_purchases
GROUP BY user_id
),
user_activity AS (
SELECT DISTINCT
up.user_id,
fp.cohort_month,
DATE_TRUNC('month', up.purchase_date) AS activity_month
FROM user_purchases up
INNER JOIN first_purchase fp ON up.user_id = fp.user_id
),
retention AS (
SELECT
cohort_month,
EXTRACT(YEAR FROM AGE(activity_month, cohort_month)) * 12 +
EXTRACT(MONTH FROM AGE(activity_month, cohort_month)) AS months_since_signup,
COUNT(DISTINCT user_id) AS active_users
FROM user_activity
GROUP BY cohort_month, activity_month
),
cohort_size AS (
SELECT cohort_month, COUNT(*) AS total_users
FROM first_purchase
GROUP BY cohort_month
)
SELECT
r.cohort_month,
cs.total_users AS cohort_size,
r.months_since_signup,
r.active_users,
ROUND(100.0 * r.active_users / cs.total_users, 1) AS retention_pct
FROM retention r
INNER JOIN cohort_size cs ON r.cohort_month = cs.cohort_month
ORDER BY r.cohort_month, r.months_since_signup;
-- Result:
-- cohort_month | cohort_size | months_since | active_users | retention_pct
-- -------------+-------------+--------------+--------------+--------------
-- 2024-01-01 | 4 | 0 | 4 | 100.0
-- 2024-01-01 | 4 | 1 | 2 | 50.0
-- 2024-01-01 | 4 | 2 | 2 | 50.0
-- 2024-01-01 | 4 | 3 | 2 | 50.0
-- 2024-02-01 | 2 | 0 | 2 | 100.0
-- 2024-02-01 | 2 | 1 | 1 | 50.0
-- 2024-03-01 | 1 | 0 | 1 | 100.0
Key insight: Cohort retention is the most important product metric for subscription and e-commerce businesses. The pattern is: (1) find each user's cohort (first activity month), (2) calculate months since signup for each activity, (3) count distinct users per cohort per period, (4) divide by cohort size. This is a top-5 most-asked SQL problem at data-focused companies.
Lilly Tech Systems