Skip to content
Snippets Groups Projects
Commit c66fb035 authored by Erick Lavoie's avatar Erick Lavoie
Browse files

Updated for 2022 version of the course

parent c47ce550
No related branches found
No related tags found
No related merge requests found
package test.predict
import org.scalatest._
import funsuite._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Logger
import org.apache.log4j.Level
import shared.predictions._
import tests.shared.helpers._
import ujson._
/**
 * Template test suite for the k-Nearest-Neighbour predictor.
 *
 * NOTE(review): the numeric assertions below are course-template
 * placeholders — e.g. `within(1.0, 0.0, 0.0001)` is always false — and
 * are meant to be replaced with the actual similarity / prediction / MAE
 * values produced by the student's implementation in
 * 'src/main/scala/shared/predictions.scala'.
 */
class kNNTests extends AnyFunSuite with BeforeAndAfterAll {

  // Field separator for the MovieLens 100k data files (tab-separated).
  val separator = "\t"

  // Initialized in beforeAll; kept as vars because ScalaTest constructs
  // the suite before the Spark session exists.
  var spark : org.apache.spark.sql.SparkSession = _

  val train2Path = "data/ml-100k/u2.base"
  val test2Path = "data/ml-100k/u2.test"
  var train2 : Array[shared.predictions.Rating] = null
  var test2 : Array[shared.predictions.Rating] = null
  // Placeholder for the user-user adjusted-cosine similarity matrix;
  // never assigned in this template.
  var adjustedCosine : Map[Int, Map[Int, Double]] = null

  // NOTE(review): procedure syntax (`def beforeAll {`) is deprecated in
  // modern Scala; consider `override def beforeAll(): Unit = {` — confirm
  // against the project's Scala version before changing.
  override def beforeAll {
    // Silence Spark/Akka logging so test output stays readable.
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    spark = SparkSession.builder()
      .master("local[1]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    // For these questions, train and test are collected in a scala Array
    // to not depend on Spark
    train2 = load(spark, train2Path, separator).collect()
    test2 = load(spark, test2Path, separator).collect()
  }

  // All the functions definitions for the tests below (and the tests in other suites)
  // should be in a single library, 'src/main/scala/shared/predictions.scala'.

  // Provide tests to show how to call your code to do the following tasks.
  // Ensure you use the same function calls to produce the JSON outputs in
  // src/main/scala/predict/Baseline.scala.
  // Add assertions with the answer you expect from your code, up to the 4th
  // decimal after the (floating) point, on data/ml-100k/u2.base (as loaded above).
  test("kNN predictor with k=10") {
    // Create predictor on train2

    // Similarity between user 1 and itself
    assert(within(1.0, 0.0, 0.0001))

    // Similarity between user 1 and 864
    assert(within(1.0, 0.0, 0.0001))

    // Similarity between user 1 and 886
    assert(within(1.0, 0.0, 0.0001))

    // Prediction user 1 and item 1
    assert(within(1.0, 0.0, 0.0001))

    // MAE on test2
    assert(within(1.0, 0.0, 0.0001))
  }

  test("kNN Mae") {
    // Compute MAE for k around the baseline MAE

    // Ensure the MAEs are indeed lower/higher than baseline
    // (placeholder: always fails until replaced with real values)
    assert(1.0 < 0.0)
  }
}
package test.recommend
import org.scalatest._
import funsuite._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Logger
import org.apache.log4j.Level
import shared.predictions._
import tests.shared.helpers._
import ujson._
/**
 * Template test suite for the final recommender (user 944 = the personal
 * ratings appended to the MovieLens data).
 *
 * NOTE(review): the assertions are course-template placeholders to be
 * replaced with values computed by the implementation in
 * 'src/main/scala/shared/predictions.scala'.
 */
class RecommenderTests extends AnyFunSuite with BeforeAndAfterAll {

  // Field separator for the MovieLens 100k data file (tab-separated).
  val separator = "\t"

  var spark : org.apache.spark.sql.SparkSession = _

  val dataPath = "data/ml-100k/u.data"
  val personalPath = "data/personal.csv"
  var data : Array[shared.predictions.Rating] = null
  var personal : Array[shared.predictions.Rating] = null
  var train : Array[shared.predictions.Rating] = null
  var predictor : (Int, Int) => Double = null

  override def beforeAll {
    // Silence Spark/Akka logging so test output stays readable.
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    spark = SparkSession.builder()
      .master("local[1]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    data = load(spark, dataPath, separator).collect()

    println("Loading personal data from: " + personalPath)
    val personalFile = spark.sparkContext.textFile(personalPath)
    // Parse the personal CSV into Ratings for user 944.
    // Header rows and rows without a third (rating) column are mapped to
    // a sentinel rating of 0.0 and then dropped by the filter below, so
    // `personal` only keeps items the user actually rated.
    personal = personalFile.map(l => {
      val cols = l.split(",").map(_.trim)
      if (cols(0) == "id")
        Rating(944,0,0.0)
      else
        if (cols.length < 3)
          Rating(944, cols(0).toInt, 0.0)
        else
          Rating(944, cols(0).toInt, cols(2).toDouble)
    }).filter(r => r.rating != 0).collect()

    // TODO: Create predictor
  }

  // All the functions definitions for the tests below (and the tests in other suites)
  // should be in a single library, 'src/main/scala/shared/predictions.scala'.
  //

  test("Prediction for user 1 of item 1") {
    // Placeholder: replace with the predictor's actual output.
    assert(within(1.0, 0.0, 0.0001))
  }

  test("Top 3 recommendations for user 944") {
    // Placeholder list: replace with the recommender's actual top-3
    // (item id, predicted rating) pairs.
    val recommendations = List((1,0.0), (2,0.0), (3,0.0))

    assert(recommendations(0)._1 == 4)
    assert(within(recommendations(0)._2, 5.0, 0.0001))

    // Idem recommendation 2 and 3
  }
}
package tests.shared

package object helpers {

  /**
   * Checks that `actual` lies within `interval` of `expected`, i.e. inside
   * the closed range [expected - interval, expected + interval].
   *
   * Used by the test suites to compare floating-point answers up to a
   * tolerance (typically 0.0001, i.e. 4 decimal places).
   *
   * @param actual   value produced by the code under test
   * @param expected reference value
   * @param interval non-negative tolerance
   * @return true iff |actual - expected| is within the tolerance
   */
  // Fix: dropped the redundant `return` keyword — in Scala the last
  // expression is the result, and `return` is an anti-pattern.
  def within(actual: Double, expected: Double, interval: Double): Boolean =
    actual >= (expected - interval) && actual <= (expected + interval)
}
test.sh 0 → 100755
#!/usr/bin/env bash
# Run the full test suite ('test.AllTests') through sbt, appending all
# output to a per-run, timestamped log directory.
#
# If your default java install does not work, explicitly
# provide the path to the JDK 1.8 installation. On OSX
# with homebrew:
# export JAVA_HOME=/usr/local/Cellar/openjdk@8/1.8.0+282; ./test.sh
export JAVA_OPTS="-Xmx8G"
# One log directory per run, stamped with date and host.
RUN=./logs/test-$(date "+%Y-%m-%d-%H:%M:%S")-$(hostname)
mkdir -p "$RUN"
LOGS="$RUN/log.txt"
# Fix: the original '2>&1 >>$LOGS' applied the redirections left-to-right,
# sending stderr to the terminal instead of the log. To capture both
# streams the file redirection must come first: >>"$LOGS" 2>&1.
sbt "testOnly test.AllTests" >> "$LOGS" 2>&1
#!/usr/bin/env bash
# Time the distributed baseline on a Spark cluster (1 and 4 executors),
# writing JSON measurements and logs to a per-run directory.
#
# If your default java install does not work, explicitly
# provide the path to the JDK 1.8 installation. On OSX
# with homebrew:
# export JAVA_HOME=/usr/local/Cellar/openjdk@8/1.8.0+282; ./run.sh
export JAVA_OPTS="-Xmx8G"
# One log directory per run, stamped with date and host.
RUN=./logs/timecluster-$(date "+%Y-%m-%d-%H:%M:%S")-$(hostname)
mkdir -p "$RUN"
LOGS="$RUN/log.txt"
# Provides $SPARKMASTER, $ML25Mr2train and $ML25Mr2test.
source ./config.sh
echo "------------------- DISTRIBUTED ---------------------" >> "$LOGS"
# Build the fat jar before submitting.
sbt assembly
# Fix: '2>&1 >>$LOGS' sent stderr to the terminal, not the log; the file
# redirection must precede '2>&1' to capture both streams.
# 1 Executor
spark-submit --class distributed.DistributedBaseline --master $SPARKMASTER --num-executors 1 target/scala-2.11/m1_yourid-assembly-1.0.jar --train $ML25Mr2train --test $ML25Mr2test --separator , --json "$RUN/distributed-25m-1.json" --num_measurements 3 >> "$LOGS" 2>&1
# 4 Executors
spark-submit --class distributed.DistributedBaseline --master $SPARKMASTER --num-executors 4 target/scala-2.11/m1_yourid-assembly-1.0.jar --train $ML25Mr2train --test $ML25Mr2test --separator , --json "$RUN/distributed-25m-4.json" --num_measurements 3 >> "$LOGS" 2>&1
#!/usr/bin/env bash
# Time the baseline (100k and 25M) and the local distributed baseline
# (1 and 4 local cores), writing JSON measurements and logs per run.
#
# If your default java install does not work, explicitly
# provide the path to the JDK 1.8 installation. On OSX
# with homebrew:
# export JAVA_HOME=/usr/local/Cellar/openjdk@8/1.8.0+282;
export JAVA_OPTS="-Xmx8G"
# One log directory per run, stamped with date and host.
RUN=./logs/timeOthers-$(date "+%Y-%m-%d-%H:%M:%S")-$(hostname)
mkdir -p "$RUN"
LOGS="$RUN/log.txt"
# Fix: '2>&1 >>$LOGS' sent stderr to the terminal, not the log; the file
# redirection must precede '2>&1' to capture both streams.
echo "------------------- BASELINE ---------------------" >> "$LOGS"
sbt "runMain predict.Baseline --train data/ml-100k/u2.base --test data/ml-100k/u2.test --json $RUN/baseline-100k.json --num_measurements 3" >> "$LOGS" 2>&1
echo "------------------- DISTRIBUTED ---------------------" >> "$LOGS"
sbt "runMain predict.Baseline --train data/ml-25m/r2.train --test data/ml-25m/r2.test --separator , --json $RUN/baseline-25m.json --num_measurements 3" >> "$LOGS" 2>&1
sbt "runMain distributed.DistributedBaseline --train data/ml-25m/r2.train --test data/ml-25m/r2.test --separator , --json $RUN/distributed-25m-1.json --num_measurements 3 --master local[1]" >> "$LOGS" 2>&1
sbt "runMain distributed.DistributedBaseline --train data/ml-25m/r2.train --test data/ml-25m/r2.test --separator , --json $RUN/distributed-25m-4.json --num_measurements 3 --master local[4]" >> "$LOGS" 2>&1
#!/usr/bin/env bash
# Time the kNN predictor on the 100k dataset, writing JSON measurements
# and logs to a per-run, timestamped directory.
#
# If your default java install does not work, explicitly
# provide the path to the JDK 1.8 installation. On OSX
# with homebrew:
# export JAVA_HOME=/usr/local/Cellar/openjdk@8/1.8.0+282;
export JAVA_OPTS="-Xmx8G"
# One log directory per run, stamped with date and host.
RUN=./logs/timetrials-$(date "+%Y-%m-%d-%H:%M:%S")-$(hostname)
mkdir -p "$RUN"
LOGS="$RUN/log.txt"
echo "------------------- KNN -----------------------------" >> "$LOGS"
# Fix: '2>&1 >>$LOGS' sent stderr to the terminal, not the log; the file
# redirection must precede '2>&1' to capture both streams.
sbt "runMain predict.kNN --train data/ml-100k/u2.base --test data/ml-100k/u2.test --json $RUN/knn-100k.json --num_measurements 3" >> "$LOGS" 2>&1
0% — Loading.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.