Philipp Haller, EPFL and Stanford University
Using parallel collections to manipulate a large piece of data from Wikipedia. Suitable for entry into Scala's interpreter; feel free to play around! Click here to download the Wikipedia data file, titles-sorted-small.txt.
NOTE: This example requires Scala 2.9.
import scala.collection.parallel.immutable.ParVector
import scala.collection.parallel.mutable.ParArray
import scala.collection.immutable.Vector
/** Small benchmark that runs the same word-frequency analysis over a slice of
  * Wikipedia article titles twice: once on a sequential collection and once on
  * a parallel one, printing the elapsed time of each run.
  */
object ParCollsExample {
  import scala.collection.GenSeq

  /** Words that are excluded from the occurrence counts (compared with `==`,
    * so matching is case-sensitive).
    */
  val stopWords: List[String] = List("the", "of", "from", "a", "an", "by", "in", "All")

  /** Reads a window of lines from a titles file and splits every line on `'_'`.
    *
    * @param fileName path of the file to read
    * @param skip     number of leading lines to discard
    * @param limit    maximum number of lines to keep after skipping
    * @return one array of words per retained line, in file order; empty when
    *         the file has `skip` lines or fewer
    */
  def readWikipediaTitles(fileName: String = "titles-sorted-small.txt",
                          skip: Int = 300000,
                          limit: Int = 10000): ParArray[Array[String]] = {
    val source = scala.io.Source.fromFile(fileName)
    try {
      // getLines() is lazy: toArray forces it while the file is still open, and
      // building the array once avoids re-creating the collection per append.
      source.getLines().drop(skip).take(limit).map(line => line.split('_')).toArray.par
    } finally {
      source.close()
    }
  }

  /** Evaluates `block` once and prints the elapsed wall-clock milliseconds as
    * `time: <millis>`.
    *
    * @param block the computation to measure
    */
  def time(block: => Unit): Unit = {
    val start = System.currentTimeMillis()
    block
    val end = System.currentTimeMillis()
    println("time: " + (end - start))
  }

  /** Prints the `(count, word, title)` triple with the highest count, or a
    * short notice when `sq` contains no words at all.
    *
    * @param sq titles, each already split into words; a parallel collection is
    *           processed in parallel, a sequential one sequentially
    */
  def mostOften(sq: GenSeq[Array[String]]): Unit = {
    mostFrequent(wordCounts(sq)) match {
      case Some(top) => println(top)
      case None      => println("no words to count")
    }
  }

  /** Runs both benchmark passes over the default titles file.
    *
    * @param args ignored
    */
  def main(args: Array[String]): Unit = {
    val tv = readWikipediaTitles()
    val tvseq = tv.seq
    println("sequentially:")
    time {
      report(tvseq)
    }
    println("in parallel:")
    time {
      report(tv)
    }
  }

  /** Builds one `(count, word, title)` triple per word of every title, where
    * `count` is how often the word occurs among all non-stop words.
    */
  private def wordCounts(sq: GenSeq[Array[String]]): GenSeq[(Int, String, String)] = {
    val words = sq.flatten.filter(w => !stopWords.contains(w))
    // NOTE(review): a full scan per word is quadratic; presumably this is the
    // deliberate workload of the benchmark, so it is left unchanged.
    val counts = for (title <- sq; w <- title) yield {
      val c = words.count(_ == w)
      (c, w, title.mkString(" "))
    }
    counts
  }

  /** Returns the triple with the greatest count (the later one wins a tie), or
    * `None` for an empty input instead of throwing.
    */
  private def mostFrequent(counts: GenSeq[(Int, String, String)]): Option[(Int, String, String)] =
    counts.reduceOption((p1, p2) => if (p1._1 > p2._1) p1 else p2)

  /** Prints the most frequent word's triple followed by up to 20 triples that
    * share that word.
    */
  private def report(sq: GenSeq[Array[String]]): Unit = {
    val counts = wordCounts(sq)
    mostFrequent(counts) match {
      case Some(top) =>
        println(top)
        val someTitles = counts.filter(tuple => tuple._2 == top._2)
        println(someTitles.take(20))
      case None =>
        println("no words to count")
    }
  }
}