quantified_features is useful for transforming values of non-number columns to indexed numbers.

Note: The feature is supported from Hivemall v0.4 or later.

desc train;

id                      int
age                     int
job                     string
marital                 string
education               string
default                 string
balance                 int
housing                 string
loan                    string
contact                 string
day                     int
month                   string
duration                int
campaign                int
pdays                   int
previous                int
poutcome                string
y                       int

select * from train limit 10;

1       39      blue-collar     married secondary       no      1756    yes     no      cellular        3       apr     939     1       -1      0       unknown 1
2       51      entrepreneur    married primary no      1443    no      no      cellular        18      feb     172     10      -1      0       unknown 1
3       36      management      single  tertiary        no      436     no      no      cellular        13      apr     567     1       595     2       failure 1
4       63      retired married secondary       no      474     no      no      cellular        25      jan     423     1       -1      0       unknown 1
5       31      management      single  tertiary        no      354     no      no      cellular        30      apr     502     1       9       2       success 1
6       29      blue-collar     single  secondary       no      260     yes     no      unknown 2       jun     707     14      -1      0       unknown 1
7       37      services        married secondary       no      52      yes     no      cellular        6       sep     908     1       185     9       success 1
8       32      technician      single  secondary       no      230     yes     no      cellular        18      may     442     1       266     8       failure 1
9       31      admin.  single  secondary       no      0       yes     no      cellular        7       may     895     2       295     2       failure 1
10      32      self-employed   single  tertiary        no      1815    no      no      telephone       10      aug     235     1       102     2       failure 1

set hivevar:output_row=true;

select quantify(${output_row}, *) from ( select * from train order by id asc -- force quantify() runs on a single reducer ) t limit 10; 1 39 0 0 0 0 1756 0 0 0 3 0 939 1 -1 0 0 1 2 51 1 0 1 0 1443 1 0 0 18 1 172 10 -1 0 0 1 3 36 2 1 2 0 436 1 0 0 13 0 567 1 595 2 1 1 4 63 3 0 0 0 474 1 0 0 25 2 423 1 -1 0 0 1 5 31 2 1 2 0 354 1 0 0 30 0 502 1 9 2 2 1 6 29 0 1 0 0 260 0 0 1 2 3 707 14 -1 0 0 1 7 37 4 0 0 0 52 0 0 0 6 4 908 1 185 9 2 1 8 32 5 1 0 0 230 0 0 0 18 5 442 1 266 8 1 1 9 31 6 1 0 0 0 0 0 0 7 5 895 2 295 2 1 1 10 32 7 1 2 0 1815 1 0 2 10 6 235 1 102 2 1 1  select quantify(${output_row}, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, cast(pdays as string), previous, poutcome, y
) as (id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome, y)
from (
select * from train
order by id asc
) t
limit 10;

1       39      0       0       0       0       1756    0       0       0       3       0       939     1       0       0       0       1
2       51      1       0       1       0       1443    1       0       0       18      1       172     10      0       0       0       1
3       36      2       1       2       0       436     1       0       0       13      0       567     1       1       2       1       1
4       63      3       0       0       0       474     1       0       0       25      2       423     1       0       0       0       1
5       31      2       1       2       0       354     1       0       0       30      0       502     1       2       2       2       1
6       29      0       1       0       0       260     0       0       1       2       3       707     14      0       0       0       1
7       37      4       0       0       0       52      0       0       0       6       4       908     1       3       9       2       1
8       32      5       1       0       0       230     0       0       0       18      5       442     1       4       8       1       1
9       31      6       1       0       0       0       0       0       0       7       5       895     2       5       2       1       1
10      32      7       1       2       0       1815    1       0       2       10      6       235     1       6       2       1       1

select
quantified_features(
\${output_row}, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, cast(pdays as string), previous, poutcome, y
) as features
from (
select * from train
order by id asc
) t
limit 10;

[1.0,39.0,0.0,0.0,0.0,0.0,1756.0,0.0,0.0,0.0,3.0,0.0,939.0,1.0,0.0,0.0,0.0,1.0]
[2.0,51.0,1.0,0.0,1.0,0.0,1443.0,1.0,0.0,0.0,18.0,1.0,172.0,10.0,0.0,0.0,0.0,1.0]
[3.0,36.0,2.0,1.0,2.0,0.0,436.0,1.0,0.0,0.0,13.0,0.0,567.0,1.0,1.0,2.0,1.0,1.0]
[4.0,63.0,3.0,0.0,0.0,0.0,474.0,1.0,0.0,0.0,25.0,2.0,423.0,1.0,0.0,0.0,0.0,1.0]
[5.0,31.0,2.0,1.0,2.0,0.0,354.0,1.0,0.0,0.0,30.0,0.0,502.0,1.0,2.0,2.0,2.0,1.0]
[6.0,29.0,0.0,1.0,0.0,0.0,260.0,0.0,0.0,1.0,2.0,3.0,707.0,14.0,0.0,0.0,0.0,1.0]
[7.0,37.0,4.0,0.0,0.0,0.0,52.0,0.0,0.0,0.0,6.0,4.0,908.0,1.0,3.0,9.0,2.0,1.0]
[8.0,32.0,5.0,1.0,0.0,0.0,230.0,0.0,0.0,0.0,18.0,5.0,442.0,1.0,4.0,8.0,1.0,1.0]
[9.0,31.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,895.0,2.0,5.0,2.0,1.0,1.0]
[10.0,32.0,7.0,1.0,2.0,0.0,1815.0,1.0,0.0,2.0,10.0,6.0,235.0,1.0,6.0,2.0,1.0,1.0]


## Quantify test dataset

select * from test limit 10;

1       30      management      single  tertiary        no      1028    no      no      cellular        4       feb     1294    2       -1      0       unknown
2       39      self-employed   single  tertiary        no      426     no      no      unknown 18      jun     1029    1       -1      0       unknown
3       38      technician      single  tertiary        no      -572    yes     yes     unknown 5       jun     26      24      -1      0       unknown
4       34      technician      single  secondary       no      -476    yes     no      unknown 27      may     92      4       -1      0       unknown
5       37      entrepreneur    married primary no      62      no      no      cellular        31      jul     404     2       -1      0       unknown
6       43      services        married primary no      574     yes     no      cellular        8       may     140     1       -1      0       unknown
7       54      technician      married secondary       no      324     yes     no      telephone       13      may     51      1       -1      0       unknown
8       41      blue-collar     married secondary       no      121     yes     no      cellular        13      may     16      6       176     5       other
9       52      housemaid       married primary no      1466    no      yes     cellular        20      nov     150     1       -1      0       unknown
10      32      management      married secondary       no      6217    yes     yes     cellular        18      nov     486     2       181     2       failure

select
id,
array(age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome) as features
from (
select
quantify(
output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, if(pdays==-1,0,pdays), previous, poutcome
) as (id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome)
from (
select * from (
select
1 as train_first, false as output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome
from
train
union all
select
2 as train_first, true as output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome
from
test
) t0
order by train_first, id asc
) t1
) t2
limit 10;

1       [30,2,1,2,0,1028,1,0,0,4,1,1294,2,0,0,0]
2       [39,7,1,2,0,426,1,0,1,18,3,1029,1,0,0,0]
3       [38,5,1,2,0,-572,0,1,1,5,3,26,24,0,0,0]
4       [34,5,1,0,0,-476,0,0,1,27,5,92,4,0,0,0]
5       [37,1,0,1,0,62,1,0,0,31,8,404,2,0,0,0]
6       [43,4,0,1,0,574,0,0,0,8,5,140,1,0,0,0]
7       [54,5,0,0,0,324,0,0,2,13,5,51,1,0,0,0]
8       [41,0,0,0,0,121,0,0,0,13,5,16,6,176,5,3]
9       [52,8,0,1,0,1466,1,1,0,20,9,150,1,0,0,0]
10      [32,2,0,0,0,6217,0,1,0,18,9,486,2,181,2,1]