Query

#standardSQL

CREATE TEMP FUNCTION label_trend(monthlys ARRAY<STRUCT<month TIMESTAMP, c INT64>>) AS ((

SELECT CASE

WHEN c<36 AND recent_peak_corr>0 AND recent_peak_corr>growing_corr AND recent_peak_corr > ABS(comeback_corr) THEN 'recent_peak'

WHEN growing_corr>0 AND (c<36 OR growing_corr>ABS(comeback_corr)) THEN 'growing'

WHEN -1*growing_corr>ABS(comeback_corr) OR c<36 THEN 'declining'

WHEN comeback_corr>0 THEN 'comeback'

ELSE 'peaked'

END

FROM (

(SELECT AS STRUCT

CORR((IFNULL(x.c,0.1)), trends.growing) AS growing_corr

, CORR((IFNULL(x.c,0.1)), trends.comeback) AS comeback_corr

, CORR((IFNULL(x.c,0.1)), trends.recent_peak) AS recent_peak_corr

, COUNT(*) AS c

FROM UNNEST(monthlys) x

JOIN (

SELECT *, ABS(growing-54/2) comeback, -1*ABS(LEAST (0,74/2-growing)) recent_peak

FROM UNNEST(GENERATE_DATE_ARRAY('2013-01-01', '2017-06-01', INTERVAL 1 MONTH)) AS m WITH OFFSET growing

) trends

ON DATE(x.month)=trends.m))

)); WITH extract_users AS (

SELECT subreddit, author, REGEXP_EXTRACT_ALL(LOWER(body), r'/u/[a-zA-Z0-9-_]{3,20}') users, body, TIMESTAMP_TRUNC(TIMESTAMP_SECONDS(created_utc), MONTH) month

FROM `fh-bigquery.reddit_comments.20*`

WHERE _TABLE_SUFFIX >= '13'

) SELECT * EXCEPT(monthlys), label_trend( monthlys) trend

FROM (

SELECT *

FROM (

SELECT user, month, COUNT(*) c

, APPROX_TOP_COUNT(sub, 1)[OFFSET(0)].value top_sub

, ANY_VALUE(mentioned_by) mentioned_by

, ARRAY_AGG(STRUCT(month, COUNT(*) AS c)) OVER(PARTITION BY user) monthlys

FROM (

SELECT author, user, month, COUNT(*) c

, APPROX_TOP_COUNT(subreddit, 1)[OFFSET(0)].value sub

, COUNT(DISTINCT author) OVER(PARTITION BY user) mentioned_by

FROM extract_users, UNNEST(users) user

GROUP BY 1, 2, 3

)

WHERE mentioned_by>800

GROUP BY 1, 2

)

)

ORDER BY c DESC

Query notes

I played with CORR() to cluster by trends. Interesting results, but I still want to play a little more with it.

Note the use of SQL UDFs and WITH subqueries.

OVER() helps me find the total number of mentions over all the years for each user, and also aggregate series for the correlation experiment.

Basic queries

Extract all username mentions:

WITH extract_users AS (

SELECT subreddit, author, REGEXP_EXTRACT_ALL(LOWER(body), r'/u/[a-zA-Z0-9-_]{3,20}') users, body, TIMESTAMP_TRUNC(TIMESTAMP_SECONDS(created_utc), MONTH) month

FROM `fh-bigquery.reddit_comments.20*`

WHERE _TABLE_SUFFIX >= '13'

)

Normalization: If an account was mentioned multiple times by the same user, it counts only once. Find also what was the top subreddit that user used that month to mention the account. Do it by month, but also do an OVER() to get the total number of mentioners across all months:

SELECT author, user, month, COUNT(*) c

, APPROX_TOP_COUNT(subreddit, 1)[OFFSET(0)].value sub

, COUNT(DISTINCT author) OVER(PARTITION BY user) mentioned_by

FROM extract_users, UNNEST(users) user

GROUP BY 1, 2, 3

Then count the number of mentioners each month, find the subreddit that gave the most mentions to that account, and re-use the total number of mentions we got on the previous query. Then keep only the accounts that have received at least 800 mentions by different users:

SELECT user, month, COUNT(*) c

, APPROX_TOP_COUNT(sub, 1)[OFFSET(0)].value top_sub

, ANY_VALUE(mentioned_by) mentioned_by

FROM (

[see previous query]

)

WHERE mentioned_by>800

GROUP BY 1, 2

Bonus

Next steps

Discuss on reddit:

Want more stories? Check my Medium, follow me on twitter, and subscribe to reddit.com/r/bigquery. And try BigQuery — every month you get a full terabyte of analysis for free.