Since all my scraped data is in Postgres, this is the easiest way to get it out – the fastest iteration possible. At some point I’ll probably switch to a Java library. It’s interesting to see, but probably the only lesson from this is that all ETL scripts are ugly.
WITH advertisers_ranked AS ( SELECT advertiser_id, REPLACE(REPLACE(LOWER(advertiser), ' ', '_'), '/', '_') advertiser, 6 + dense_rank() OVER (partition BY 1 ORDER BY advertiser) advertiser_rank -- 6 for the number of attributes prior to the 'advertiser' attributes FROM advertisers ) SELECT '@RELATION flippa' line UNION ALL SELECT '@ATTRIBUTE default numeric' line UNION ALL SELECT '@ATTRIBUTE siteid string' line UNION ALL SELECT '@ATTRIBUTE banned {0,1}' line UNION ALL SELECT '@ATTRIBUTE length numeric' line UNION ALL SELECT '@ATTRIBUTE h1 numeric' line UNION ALL SELECT '@ATTRIBUTE h2 numeric' line UNION ALL SELECT '@ATTRIBUTE h3 numeric' line UNION ALL (SELECT '@ATTRIBUTE ' || advertiser || ' {0, 1}' line FROM advertisers_ranked ORDER BY advertiser_rank) UNION ALL SELECT '@DATA' line UNION ALL -- there are N advertisers per row, this combines them into one SELECT '{' || siteid || ', ' || banned || ', ' || LENGTH || ', ' || h1 || ', ' || h2 || ', ' || h3 || ', ' || array_to_string(array_agg(advertiser ORDER BY advertiser_rank), ', ') || '}' line FROM ( SELECT DISTINCT '1 ' || s.site_id siteid, '2 ' || (CASE WHEN seller LIKE '%banned%' THEN 1 ELSE 0 END) AS banned, '3 ' || CHAR_LENGTH(description) LENGTH, '4 ' || (LENGTH(description) - LENGTH(regexp_replace(LOWER(description),'h1','','g'))) / LENGTH('h1') h1, '5 ' || (LENGTH(description) - LENGTH(regexp_replace(LOWER(description),'h2','','g'))) / LENGTH('h2') h2, '6 ' || (LENGTH(description) - LENGTH(regexp_replace(LOWER(description),'h3','','g'))) / LENGTH('h3') h3, advertiser_rank || ' 1' advertiser, advertiser_rank FROM sites s JOIN sites_advertisers ON s.site_id = sites_advertisers.site_id JOIN advertisers_ranked a ON a.advertiser_id = sites_advertisers.advertiser_id JOIN auctions ON auctions.site_id = s.site_id ) a GROUP BY siteid, banned, LENGTH, h1, h2, h3 |