Scala for Multicore

UPMARC Multicore Computing Summer School 2011

Philipp Haller, EPFL and Stanford University

Code Example: Parallel Collections

This example uses parallel collections to manipulate a large piece of data from Wikipedia. It is suitable for entry into Scala's interpreter, so feel free to play around! Click here to download the Wikipedia data file, titles-sorted-small.txt.

NOTE: This example requires Scala 2.9.


import scala.collection.parallel.immutable.ParVector
import scala.collection.parallel.mutable.ParArray
import scala.collection.immutable.Vector

/** Finds the most frequent word in a slice of Wikipedia article titles,
  * first on a sequential collection and then on a parallel one, printing
  * the elapsed wall-clock time of each run.
  */
object ParCollsExample {

  /** Reads 10000 titles from "titles-sorted-small.txt" (resolved against the
    * current working directory), skipping the first 300000 lines, and splits
    * each title on '_' into its words.
    *
    * The result is empty when the file has 300000 lines or fewer.
    *
    * @return one array of words per title, in file order
    */
  def readWikipediaTitles(): ParArray[Array[String]] = {
    val source = scala.io.Source.fromFile("titles-sorted-small.txt")
    try {
      // The line iterator reads from the open file, so the selected slice is
      // copied into an array before the source is closed in `finally`.
      val selected = source.getLines().drop(300000).take(10000).map(_.split('_')).toArray
      // Build the parallel array in one step instead of appending title by title.
      ParArray(selected: _*)
    } finally {
      source.close()
    }
  }

  /** Evaluates `block` once and prints the elapsed time in milliseconds.
    *
    * @param block the code to time; evaluated exactly once
    */
  def time(block: => Unit): Unit = {
    val start = System.currentTimeMillis()
    block
    val end = System.currentTimeMillis()
    println("time: " + (end - start))
  }

  /** Prints the (count, word, title) triple of the most frequent non-stop
    * word in `sq`, or "no words found" when `sq` contains no words at all.
    *
    * On a tie the triple that occurs last in `sq` is printed.
    *
    * @param sq titles, each given as an array of its words
    */
  def mostOften(sq: scala.collection.GenSeq[Array[String]]): Unit = {
    val words = sq.flatten.filter(w => !stopWords.contains(w))

    // Every word of every title is counted against `words`; stop words were
    // removed from `words`, so they always get a count of 0.
    val counts = for (t <- sq; w <- t) yield {
      val c = words.count(_ == w)
      (c, w, t.mkString(" "))
    }

    // reduce throws on an empty collection, so guard against it explicitly.
    if (counts.isEmpty) {
      println("no words found")
    } else {
      val best = counts.reduce((p1, p2) => if (p1._1 > p2._1) p1 else p2)
      println(best)
    }
  }

  /** Words that are excluded from the frequency count. */
  val stopWords: List[String] = List("the", "of", "from", "a", "an", "by", "in", "All")

  /** Runs the same word-frequency analysis on the sequential view of the
    * titles and then on the parallel array, timing each run.
    *
    * The two sections are intentionally kept as separate code so that each
    * one operates directly on its own collection type.
    *
    * @param args ignored
    */
  def main(args: Array[String]): Unit = {
    val tv = readWikipediaTitles()
    val tvseq = tv.seq
    println("sequentially:")
    time {
      val words = tvseq.flatten.filter(w => !stopWords.contains(w))

      val counts = for (title <- tvseq; w <- title) yield {
        val c = words.count(_ == w)
        (c, w, title.mkString(" "))
      }

      // reduceLeft throws on an empty collection (e.g. a data file that is too short).
      if (counts.isEmpty) {
        println("no words found")
      } else {
        val mostOften = counts.reduceLeft((p1, p2) => if (p1._1 > p2._1) p1 else p2)
        println(mostOften)

        // All triples that share the winning word, i.e. the titles it occurs in.
        val someTitles = counts.filter(tuple => tuple._2 == mostOften._2)
        println(someTitles.take(20))
      }
    }

    println("in parallel:")
    time {
      val words = tv.flatten.filter(w => !stopWords.contains(w))

      val counts = for (title <- tv; w <- title) yield {
        val c = words.count(_ == w)
        (c, w, title.mkString(" "))
      }

      // reduce throws on an empty collection (e.g. a data file that is too short).
      if (counts.isEmpty) {
        println("no words found")
      } else {
        val mostOften = counts.reduce((p1, p2) => if (p1._1 > p2._1) p1 else p2)
        println(mostOften)

        // All triples that share the winning word, i.e. the titles it occurs in.
        val someTitles = counts.filter(tuple => tuple._2 == mostOften._2)
        println(someTitles.take(20))
      }
    }
  }
}