Philipp Haller, EPFL and Stanford University
Using parallel collections to manipulate a large piece of data from Wikipedia. The example is suitable for entry into Scala's interpreter, so feel free to play around! Click here to download the Wikipedia data file, titles-sorted-small.txt.
NOTE: This example requires Scala 2.9.
import scala.collection.parallel.immutable.ParVector
import scala.collection.parallel.mutable.ParArray
import scala.collection.immutable.Vector

/** Compares sequential and parallel collections on a slice of Wikipedia titles.
  *
  * For every word of every title, the example counts how often that word
  * occurs across all titles (ignoring [[stopWords]]) and reports the most
  * frequent one. The same computation is timed once on a sequential
  * collection and once on a parallel one.
  */
object ParCollsExample {

  /** Reads titles from `titles-sorted-small.txt` in the working directory.
    *
    * Skips the first 300000 lines, takes the next 10000, and splits each
    * title on `'_'` into its words.
    *
    * @return a parallel array with one word array per title, in file order
    */
  def readWikipediaTitles(): ParArray[Array[String]] = {
    val source = scala.io.Source.fromFile("titles-sorted-small.txt")
    try {
      // `getLines()` is lazy: materialise the slice into an array *before*
      // the source is closed. Building the array in one pass also avoids
      // re-copying the whole collection on every append.
      val titles: Array[Array[String]] =
        source.getLines().drop(300000).take(10000).map(_.split('_')).toArray
      titles.par
    } finally {
      source.close()
    }
  }

  /** Runs `block` and prints the elapsed wall-clock time in milliseconds.
    *
    * @param block the code to time; evaluated exactly once
    */
  def time(block: => Unit): Unit = {
    val start = System.currentTimeMillis()
    block
    val end = System.currentTimeMillis()
    println("time: " + (end - start))
  }

  /** Prints the `(count, word, title)` triple with the highest count.
    *
    * Words listed in [[stopWords]] are excluded from the counted words and
    * therefore get a count of 0. On ties the later triple wins. Whether the
    * work runs sequentially or in parallel depends on the runtime type of
    * `sq`. `reduce` fails on an empty `sq`.
    *
    * @param sq the titles, each given as an array of words
    */
  def mostOften(sq: scala.collection.GenSeq[Array[String]]): Unit = {
    val words = sq.flatten.filter(w => !stopWords.contains(w))
    // Deliberately quadratic: every word is counted against all words.
    val counts = for (t <- sq; w <- t) yield {
      val c = words.count(_ == w)
      (c, w, t.mkString(" "))
    }
    val best = counts.reduce((p1, p2) => if (p1._1 > p2._1) p1 else p2)
    println(best)
  }

  /** Words excluded from the frequency count. */
  val stopWords: List[String] = List("the", "of", "from", "a", "an", "by", "in", "All")

  /** Times the word-frequency computation sequentially, then in parallel.
    *
    * Each run prints the most frequent word's triple followed by up to 20
    * triples that share that word.
    */
  def main(args: Array[String]): Unit = {
    val tv = readWikipediaTitles()
    val tvseq = tv.seq

    println("sequentially:")
    time {
      val words = tvseq.flatten.filter(w => !stopWords.contains(w))
      val counts = for (title <- tvseq; w <- title) yield {
        val c = words.count(_ == w)
        (c, w, title.mkString(" "))
      }
      val mostOften = counts.reduceLeft((p1, p2) => if (p1._1 > p2._1) p1 else p2)
      println(mostOften)
      val someTitles = counts.filter(tuple => tuple._2 == mostOften._2)
      println(someTitles.take(20))
    }

    println("in parallel:")
    time {
      // Same pipeline as above, but every operation runs on the ParArray.
      val words = tv.flatten.filter(w => !stopWords.contains(w))
      val counts = for (title <- tv; w <- title) yield {
        val c = words.count(_ == w)
        (c, w, title.mkString(" "))
      }
      val mostOften = counts.reduce((p1, p2) => if (p1._1 > p2._1) p1 else p2)
      println(mostOften)
      val someTitles = counts.filter(tuple => tuple._2 == mostOften._2)
      println(someTitles.take(20))
    }
  }
}