Step by step:
1. Import StopWordsRemover and the SQL functions used below
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.sql.functions.{split, explode}
2. Load, transform and cleanse the data (strip everything except letters, digits and whitespace)
val lines = sc.textFile("hdfs://localhost:9000/Peter").map(_.replaceAll(raw"[^A-Za-z0-9\s]+", "").trim.toLowerCase).toDF("line")
2.1 Split each line into an array of words
val words = lines.select(split($"line", " ").alias("words"))
2.2 Create a remover that strips all stop words from the words column
val remover = new StopWordsRemover().setInputCol("words").setOutputCol("filtered")
2.3 Apply the remover, then count each remaining word
val noStopWords = remover.transform(words)
val counts = noStopWords.select(explode($"filtered").alias("word")).rdd.map(row => (row.getString(0), 1)).reduceByKey(_ + _)
2.4 Sort the result by count, descending
val mostCommon = counts.map(p => (p._2, p._1)).sortByKey(false, 1)
2.5 Save the result to HDFS (a relative path resolves under /user/root)
mostCommon.saveAsTextFile("Peter_Final")
2.6 Copy the result from HDFS to a local text file:
hdfs dfs -cat hdfs://localhost:9000/user/root/Peter_Final/part-00000 > Peter_Final.txt
2.7 Show the top 100 entries of the sorted result:
head -n 100 Peter_Final.txt
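The steps above, gathered into one spark-shell sketch (assuming `sc` and `spark` are predefined, as in a spark-shell session, and the same HDFS paths):

```scala
// Consolidated word-count pipeline; paths are the ones used in the steps above.
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.sql.functions.{split, explode}

val lines = sc.textFile("hdfs://localhost:9000/Peter")
  .map(_.replaceAll(raw"[^A-Za-z0-9\s]+", "").trim.toLowerCase)
  .toDF("line")
val words = lines.select(split($"line", " ").alias("words"))
val remover = new StopWordsRemover().setInputCol("words").setOutputCol("filtered")
val noStopWords = remover.transform(words)
val counts = noStopWords.select(explode($"filtered").alias("word"))
  .rdd.map(row => (row.getString(0), 1))
  .reduceByKey(_ + _)
// Swap (word, count) to (count, word) so sortByKey orders by frequency.
val mostCommon = counts.map(p => (p._2, p._1)).sortByKey(ascending = false, numPartitions = 1)
mostCommon.saveAsTextFile("Peter_Final")
```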
To fix: numbers are still counted as words.
To do: loop over all the Bible books, merge them into a single text file, and run the solution again.
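One way to address the numbers issue is to drop digits from the character class the cleansing regex keeps, so verse and chapter numbers never become tokens. A sketch (the sample string is illustrative, not from the data):

```scala
// Keep only letters and whitespace; digits such as "1", "2", "3" are removed.
val cleaned = "1 peter 2 3 Grace and Peace".replaceAll(raw"[^A-Za-z\s]+", "").trim.toLowerCase
// Removed digits leave extra spaces, so split on runs of whitespace
// and drop any empty tokens.
val tokens = cleaned.split(raw"\s+").filter(_.nonEmpty)
// tokens: Array("peter", "grace", "and", "peace")
```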
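Merging the books by hand may not be necessary: `sc.textFile` accepts comma-separated paths and glob patterns, so all books can be read in one pass. A sketch, assuming a hypothetical layout of one file per book under an HDFS directory named `bible`:

```scala
// Hypothetical path: every book stored as a separate file under /bible.
// The rest of the pipeline can then run on allBooks unchanged.
val allBooks = sc.textFile("hdfs://localhost:9000/bible/*")
```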