WITH RECURSIVE "gpg_siccode_recurs" AS
(
SELECT
1 AS "counter"
, x.gpgkey
, x.employerid
--* The recursive query will extract SIC codes from this field.
, x.siccode_array
--* The siccodes field is delimited by a command and a line-feed. The inner query
--* below creates an array using the line-feed. Here we remove the comma and pad
--* each sic code to five characters.
, RIGHT( '00000' || REPLACE( x.siccode_array[1], ',', '' ), 5 ) AS "siccode"
--* In the recursion query, we will limit recursion to the number of elements
--* in the array. So we need a count of array elements.
, ARRAY_LENGTH( x.siccode_array, 1 ) AS "siccode_count"
FROM (
SELECT
gpgkey
, employerid
, siccodes
--* The siccodes field is delimited by comma and a line-feed (Ascii 10)
, STRING_TO_ARRAY( siccodes, CHR(10) ) AS "siccode_array"
FROM gender_pay_gap
) AS x
WHERE
ARRAY_LENGTH( x.siccode_array, 1 ) >= 1
--* SIC Codes of '0' and '1' are not legitimate SIC Codes.
AND REPLACE( x.siccode_array[1], ',', '' ) NOT IN ( '0', '1' )
UNION ALL SELECT
y.counter + 1 AS "counter"
, y.gpgkey
, y.employerid
, y.siccode_array
, RIGHT( '00000' || REPLACE( y.siccode_array[y.counter + 1], ',', '' ), 5 )
AS "siccode"
, ARRAY_LENGTH( y.siccode_array, 1 ) AS "siccode_count"
FROM gpg_siccode_recurs y
WHERE
ARRAY_LENGTH( y.siccode_array, 1 ) >= y.counter + 1
--* SIC Codes of '0' and '1' are not legitimate SIC Codes.
AND REPLACE( y.siccode_array[y.counter + 1], ',', '' ) NOT IN ( '0', '1' )
)
SELECT
gpgkey
, counter
, employerid
, siccode
, LEFT( siccode, 2 ) AS sicdivision
, LEFT( siccode, 3 ) AS sicgroup
, LEFT( siccode, 4 ) AS sicclass
, siccode_count
FROM gpg_siccode_recurs
ORDER BY gpgkey, counter
;