E2006

http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf

Data preparation

$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2
$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2
scala> :paste
// Load the raw E2006 training set (LIBSVM format: `label` + `features`).
val rawTrainDf = spark.read.format("libsvm").load("E2006.train.bz2")

// Label range of the training set, reused below (and for the test set) to
// rescale labels.
//  - The aggregate functions are fully qualified because the vals defined
//    here are themselves named `max`/`min` and would otherwise be picked up
//    on the right-hand side instead of the SQL functions.
//  - The single aggregated row is matched directly via `first`; mapping over
//    `collect` yields an Array, which cannot be destructured into a pair.
val (max, min) = rawTrainDf
  .select(
    org.apache.spark.sql.functions.max($"label"),
    org.apache.spark.sql.functions.min($"label"))
  .first match {
    case Row(hi: Double, lo: Double) => (hi, lo)
  }

val trainDf = rawTrainDf.select(
    // `label` must be [0.0, 1.0]
    rescale($"label", lit(min), lit(max)).as("label"),
    $"features"
  )

scala> trainDf.printSchema
root
 |-- label: float (nullable = true)
 |-- features: vector (nullable = true)

scala> :paste
// Test set reshaped to one row per (rowid, feature) entry, cached for reuse.
val testDf = {
  // Tag every example with a row id and rescale its label with the
  // `min`/`max` values computed from the training labels.
  val labeled = spark.read
    .format("libsvm")
    .load("E2006.test.bz2")
    .select(rowid(), rescale($"label", lit(min), lit(max)).as("label"), $"features")

  // Expand the feature vector into `feature`/`weight` columns, then rename
  // `label` -> `target` and `weight` -> `value`.
  labeled
    .explode_vector($"features")
    .select($"rowid", $"label".as("target"), $"feature", $"weight".as("value"))
    .cache()
}

scala> testDf.printSchema
root
 |-- rowid: string (nullable = true)
 |-- target: float (nullable = true)
 |-- feature: string (nullable = true)
 |-- value: double (nullable = true)

Tutorials

[AROWe2]

Training

scala> :paste
// Model table: one averaged weight per feature, cached for the prediction join.
val modelDf = {
  // Train the AROWe2 regressor on the bias-augmented features.
  val trained = trainDf.train_arowe2_regr(append_bias($"features"), $"label")

  // Average the emitted weights per feature and restore plain column names.
  trained
    .groupBy("feature")
    .avg("weight")
    .toDF("feature", "weight")
    .cache()
}

Test

scala> :paste
// Per-row prediction: sigmoid of the summed weight * value products.
val predictDf = {
  // Pair each test entry with the model weight of the same feature; the
  // outer join keeps test entries whose feature has no model row.
  val sameFeature = testDf("feature") === modelDf("feature")
  val weighted = testDf
    .join(modelDf, sameFeature, "LEFT_OUTER")
    .select($"rowid", ($"weight" * $"value").as("value"))

  // Sum the products per row and squash the total with `sigmoid`.
  weighted
    .groupBy("rowid")
    .sum("value")
    .select($"rowid", sigmoid($"sum(value)").as("predicted"))
}

Evaluation

scala> :paste
// Print the mean target and mean prediction over the joined rows.
// NOTE(review): testDf appears to hold one row per feature entry, so these
// averages may be weighted by each row's feature count -- confirm intended.
{
  val sameRow = predictDf("rowid").as("id") === testDf("rowid")
  val scored = predictDf.join(testDf, sameRow, "INNER")

  // Global (ungrouped) aggregate over every joined row.
  scored
    .groupBy()
    .avg("target", "predicted")
    .show()
}

+------------------+------------------+
|       avg(target)|    avg(predicted)|
+------------------+------------------+
|0.5489154884487879|0.6030108853227014|
+------------------+------------------+

results matching ""

    No results matching ""