Quantcast
Channel: Gary Sieling » Uncategorized
Viewing all articles
Browse latest Browse all 12

Generating ARFF files for Weka from Postgres

$
0
0

Since all my scraped data is in Postgres, this is the easiest way to get it out – the fastest iteration possible. At some point I’ll probably switch to a Java library. It’s interesting to see, but probably the only lesson from this is that all ETL scripts are ugly.

WITH advertisers_ranked AS (
	SELECT advertiser_id, REPLACE(REPLACE(LOWER(advertiser), ' ', '_'), '/', '_') advertiser,
	6 + dense_rank() OVER (partition BY 1 ORDER BY advertiser) advertiser_rank -- 6 for the number of attributes prior to the 'advertiser' attributes
	FROM advertisers
)
SELECT '@RELATION flippa' line
UNION ALL
SELECT '@ATTRIBUTE default numeric' line
UNION ALL
SELECT '@ATTRIBUTE siteid string' line
UNION ALL
SELECT '@ATTRIBUTE banned {0,1}' line
UNION ALL
SELECT '@ATTRIBUTE length numeric' line
UNION ALL
SELECT '@ATTRIBUTE h1 numeric' line
UNION ALL
SELECT '@ATTRIBUTE h2 numeric' line
UNION ALL
SELECT '@ATTRIBUTE h3 numeric' line
UNION ALL
(SELECT '@ATTRIBUTE ' || advertiser || ' {0, 1}' line
FROM advertisers_ranked ORDER BY advertiser_rank)
UNION ALL
SELECT '@DATA' line
UNION ALL
-- there are N advertisers per row, this combines them into one
SELECT '{' || siteid || ', ' || banned || ', ' || LENGTH || ', ' || h1 || ', ' || h2 || ', ' || h3 || ', ' || array_to_string(array_agg(advertiser ORDER BY advertiser_rank), ', ') || '}' line
FROM (
	SELECT DISTINCT
	        '1 ' || s.site_id siteid,
		'2 ' || (CASE WHEN seller LIKE '%banned%' THEN 1 ELSE 0 END) AS banned,
		'3 ' || CHAR_LENGTH(description) LENGTH,
		'4 ' || (LENGTH(description) - LENGTH(regexp_replace(LOWER(description),'h1','','g'))) / LENGTH('h1') h1,
		'5 ' || (LENGTH(description) - LENGTH(regexp_replace(LOWER(description),'h2','','g'))) / LENGTH('h2') h2,
		'6 ' || (LENGTH(description) - LENGTH(regexp_replace(LOWER(description),'h3','','g'))) / LENGTH('h3') h3,
		advertiser_rank || ' 1' advertiser,
		advertiser_rank
	FROM sites s
	JOIN sites_advertisers ON s.site_id = sites_advertisers.site_id
	JOIN advertisers_ranked a ON a.advertiser_id = sites_advertisers.advertiser_id
	JOIN auctions ON auctions.site_id = s.site_id
	) a
GROUP BY siteid, banned, LENGTH, h1, h2, h3

Viewing all articles
Browse latest Browse all 12

Trending Articles