This page explains how to run matrix factorization on MovieLens 1M dataset.

## Calculate the mean rating in the training dataset

use movielens;

select avg(rating) from training;


3.593565

## Set variables (hyperparameters) for training

-- mean rating
set hivevar:mu=3.593565;
-- number of factors
set hivevar:factor=10;
-- maximum number of training iterations
set hivevar:iters=50;


Note that there are no need to set an exact value for $mu. It actually works without setting $mu but recommended to set one for getting a better prediction.

Due to a bug in Hive, do not issue comments in CLI.

## Training

create table sgd_model
as
select
idx,
array_avg(u_rank) as Pu,
array_avg(m_rank) as Qi,
avg(u_bias) as Bu,
avg(m_bias) as Bi
from (
select
train_mf_sgd(userid, movieid, rating, '-factor ${factor} -mu${mu} -iter ${iters}') as (idx, u_rank, m_rank, u_bias, m_bias) from training ) t group by idx;  ### Note Hivemall also provides train_mf_adagrad for training using AdaGrad. -help option shows a complete list of hyperparameters. # Predict select t2.actual, mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi,${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx);


# Evaluate (computes MAE and RMSE)

select
mae(predicted, actual) as mae,
rmse(predicted, actual) as rmse
from (
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted from ( select t1.userid, t1.movieid, t1.rating as actual, p1.Pu, p1.Bu from testing t1 LEFT OUTER JOIN sgd_model p1 ON (t1.userid = p1.idx) ) t2 LEFT OUTER JOIN sgd_model p2 ON (t2.movieid = p2.idx) ) t;  MAE RMSE 0.6728969407733578 0.8584162122694449 # Item Recommendation Recommend top-k movies that a user have not ever seen. set hivevar:userid=1; set hivevar:topk=5; select t1.movieid, mf_predict(t2.Pu, t1.Qi, t2.Bu, t1.Bi,${mu}) as predicted
from (
select
idx movieid,
Qi,
Bi
from
sgd_model p
where
p.idx NOT IN
(select movieid from training where userid=${userid}) ) t1 CROSS JOIN ( select Pu, Bu from sgd_model where idx =${userid}
) t2
order by
predicted DESC
limit \${topk};

movieid predicted
318 4.8051853
2503 4.788541
53 4.7518783
904 4.7463417
953 4.732769