Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
Cs449 Template M2 2022
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Hugo Manuel Serge Lanfranchi
Cs449 Template M2 2022
Commits
3d100ebc
Commit
3d100ebc
authored
2 years ago
by
hugolan
Browse files
Options
Downloads
Plain Diff
approx
parents
9e0e6c3a
8f051ddb
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/main/scala/shared/predictions.scala
+26
-23
26 additions, 23 deletions
src/main/scala/shared/predictions.scala
with
26 additions
and
23 deletions
src/main/scala/shared/predictions.scala
+
26
−
23
View file @
3d100ebc
...
...
@@ -249,35 +249,41 @@ package object predictions
}
//5
def
distributed_knn_approximate
(
preprocessed_ratings
:
CSCMatrix
[
Double
],
k
:
Int
,
spark_context
:
SparkContext
,
nbPartitions
:
Int
,
replication
:
Int
)
:
Array
[
Int
]
=
{
val
new_ratings
=
new
CSCMatrix
[
Double
](
ratings
.
rows
,
ratings
.
cols
)
//Seq[Set[Int]
users_partition
=
partitionUsers
(
preprocessed_ratings
.
rows
,
nbPartitions
,
replication
)
val
broadcast
=
sc
.
broadcast
(
preprocessed_ratings
.
toDense
)
val
approximate_topk
=
sc
.
parallelize
(
users_partition
).
map
(
partition_iterator
=>
{
def
distributed_knn_approximate
(
preprocessed_ratings
:
DenseMatrix
[
Double
],
k
:
Int
,
spark_context
:
SparkContext
,
nbPartitions
:
Int
,
replication
:
Int
)
:
CSCMatrix
[
Double
]
=
{
val
new_ratings
=
new
CSCMatrix
.
Builder
[
Double
](
rows
=
preprocessed_ratings
.
rows
,
cols
=
preprocessed_ratings
.
cols
)
val
users_partition
=
partitionUsers
(
preprocessed_ratings
.
rows
,
nbPartitions
,
replication
)
val
broadcast
=
spark_context
.
broadcast
(
preprocessed_ratings
)
val
approximate_topk
=
spark_context
.
parallelize
(
users_partition
).
map
(
partition_iterator
=>
{
val
ratings
=
broadcast
.
value
val
partition
=
ratings
(
partition_iterator
,::)
val
similarities
=
partition
*
partition
.
t
val
partition_index
=
partition
.
zipWith
(
Array
[
Int
](
partition
.
rows
)).
toMap
//val all_users = (0 until ratings.rows).toSeq
//val exclude_users =all_users.diff(partition_iterator.toSeq)
//val partition = ratings.delete(exclude_users,Axis._0)
val
slice
=
ratings
(
partition_iterator
.
toSeq
.
sortWith
(
_
<
_
),
::).
toDenseMatrix
val
similarities
=
slice
*
slice
.
t
val
partition_index
=
(
0
until
similarities
.
rows
).
zip
(
partition_iterator
.
toSeq
.
sortWith
(
_
<
_
)).
toMap
//TODO how to integrate indexes in knn
val
topk
=
(
0
until
p
ar
t
iti
on
.
rows
).
toList
.
map
(
u
=>
similarities
(
u
,
::).
t
val
topk
=
(
0
until
simil
ariti
es
.
rows
).
toList
.
map
(
u
=>
similarities
(
u
,
::).
t
.
toArray
.
zip
(
p
.
toSeq
.
sortWith
(
_
<
_
))
.
zip
(
p
artition_iterator
.
toSeq
.
sortWith
(
_
<
_
))
.
sortWith
(
_
.
_1
>
_
.
_1
)
.
slice
(
1
,
k
+
1
)
.
map
(
v
=>
(
u
,
v
.
_2
,
v
.
_1
))).
flatMap
(
x
=>
x
)
//val res = topk.map(x => knn(x,k,similarities))
res
.
map
{
case
(
u
,
v
,
s
)
=>
(
partition_index
(
u
),
v
,
s
)}
topk
.
map
{
case
(
u
,
v
,
s
)
=>
(
partition_index
(
u
),
v
,
s
)}
//val partition_index = partition.zipWith(Array[Int](partition.rows))
//TODO how to integrate indexes in knn
//val sorted_users =partition_iterator.toArray.sorted
//val topk = partition_iterator.map(x => (x,knn(sorted_users.indexOf(x),k,similarities)))
}).
collect
()
//redo knn
//val group = approximate_topk.groupby(_._1).map(x => x._2.map((_._2,_._3)).toArray.sortBy(-_).slice(0,k).map(z => (x._1,z._1,z._2)))
val
group
=
knns
.
flatMap
(
x
=>
x
)
val
group
=
approximate_topk
.
flatMap
(
x
=>
x
)
.
groupBy
(
x
=>
x
.
_1
)
.
map
(
x
=>
x
.
_2
.
map
(
y
=>
(
y
.
_2
,
y
.
_3
))
.
toList
...
...
@@ -290,22 +296,19 @@ def distributed_knn_approximate(preprocessed_ratings : CSCMatrix[Double], k : In
new_ratings
.
add
(
x
.
_1
,
x
.
_2
,
x
.
_3
)
}
return
new_ratings
.
result
()
return
new_ratings
.
result
}
def
knn_with_index
((
global_user
,
local_user
)
:
(
Int
,
Int
),
k
:
Int
,
similarities
:
DenseMatrix
[
Double
])
:
Array
[
Int
]
=
{
//first element is itself so take the tail
return
argtopk
(
similarities
(::,
user
),
k
+
1
).
toArray
.
tail
}
def
distributed_knn_predictor
(
spark_context
:
CSCMatrix
[
Double
],
k
:
Int
,
spark_context
:
SparkContext
)
:
(
Int
,
Int
)
=>
Double
=
{
def
distributed_knn_predictor_approximate
(
ratings
:
CSCMatrix
[
Double
],
k
:
Int
,
spark_context
:
SparkContext
,
nbPartitions
:
Int
,
replication
:
Int
)
:
(
Int
,
Int
)
=>
Double
=
{
var
user_average
=
compute_user_averages
(
ratings
)
var
normalized_ratings_
=
normalized_ratings
(
ratings
,
user_average
)
var
preprocessed_ratings_
=
preprocessed_ratings
(
normalized_ratings_
)
val
similarities_knn
=
distributed_knn_approximate
(
preprocessed_ratings_
,
k
,
spark_context
)
val
similarities_knn
=
distributed_knn_approximate
(
preprocessed_ratings_
,
k
,
spark_context
,
nbPartitions
,
replication
)
val
item_deviations
=
compute_item_deviations
(
ratings
,
normalized_ratings_
,
similarities_knn
)
val
item_deviations
=
compute_item_deviations
(
ratings
,
normalized_ratings_
,
similarities_knn
.
toDense
)
return
(
u
:
Int
,
i
:
Int
)
=>
predict
(
user_average
(
u
),
item_deviations
(
u
,
i
))
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment