Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Learns a vocabulary from the token sequences in the input column and returns a
 * [[CountVectorizerModel]] built from it.
 *
 * `minDF` and `maxDF` are each interpreted on one of two scales: a value `>= 1.0` is an absolute
 * number of documents, while a value `< 1.0` is a fraction of the number of input rows. Terms
 * are filtered by document frequency only when `minDF` or `maxDF` has been explicitly set. The
 * resulting vocabulary keeps at most `vocabSize` terms, taking those with the highest total
 * term count first.
 *
 * @param dataset input data; the column named by `inputCol` is read as a sequence of strings
 * @return a model holding the selected vocabulary, with this estimator as its parent
 * @throws IllegalArgumentException if the resolved `maxDF` is smaller than the resolved `minDF`,
 *                                  or if no term survives filtering
 */
override def fit(dataset: Dataset[_]): CountVectorizerModel = {
  transformSchema(dataset.schema, logging = true)

  val minDfParam = $(minDF)
  val maxDfParam = $(maxDF)
  // When both thresholds are on the same scale they can be compared up front, before any
  // Spark job is launched.
  if ((minDfParam >= 1.0 && maxDfParam >= 1.0) || (minDfParam < 1.0 && maxDfParam < 1.0)) {
    require(maxDfParam >= minDfParam, "maxDF must be >= minDF.")
  }

  val vocSize = $(vocabSize)
  val input = dataset.select($(inputCol)).rdd.map(_.getSeq[String](0))
  // A fractional threshold can only be resolved once the number of documents is known.
  val countingRequired = minDfParam < 1.0 || maxDfParam < 1.0

  val vocab = try {
    val maybeInputSize = if (countingRequired) {
      // The input is traversed twice (count + word counting), so cache it unless the caller
      // has already cached the dataset.
      if (dataset.storageLevel == StorageLevel.NONE) {
        input.persist(StorageLevel.MEMORY_AND_DISK)
      }
      Some(input.count())
    } else {
      None
    }

    // Converts a threshold to an absolute document count. A threshold below 1.0 implies
    // countingRequired, so the input size is always available on that branch.
    def toDocCount(threshold: Double): Double = maybeInputSize match {
      case Some(numDocs) if threshold < 1.0 => threshold * numDocs
      case _ => threshold
    }
    val minDf = toDocCount(minDfParam)
    val maxDf = toDocCount(maxDfParam)
    // Covers the mixed-scale case that could not be validated before counting.
    require(maxDf >= minDf, "maxDF must be >= minDF.")

    // For every term: (total occurrences, number of documents containing it).
    val allWordCounts = input.flatMap { tokens =>
      val wc = new OpenHashMap[String, Long]
      tokens.foreach { w => wc.changeValue(w, 1L, _ + 1L) }
      // Each document contributes its per-term count and a document frequency of 1.
      wc.map { case (word, count) => (word, (count, 1)) }
    }.reduceByKey { case ((wc1, df1), (wc2, df2)) => (wc1 + wc2, df1 + df2) }

    val filteringRequired = isSet(minDF) || isSet(maxDF)
    val maybeFilteredWordCounts = if (filteringRequired) {
      allWordCounts.filter { case (_, (_, df)) => df >= minDf && df <= maxDf }
    } else {
      allWordCounts
    }

    // Cached because it is evaluated by both count() and top() below.
    val wordCounts = maybeFilteredWordCounts
      .map { case (word, (count, _)) => (word, count) }
      .persist(StorageLevel.MEMORY_AND_DISK)

    try {
      val fullVocabSize = wordCounts.count()
      wordCounts
        .top(math.min(fullVocabSize, vocSize).toInt)(Ordering.by(_._2))
        .map(_._1)
    } finally {
      wordCounts.unpersist()
    }
  } finally {
    // Release the input cache even when validation or a Spark job fails part-way through.
    if (input.getStorageLevel != StorageLevel.NONE) {
      input.unpersist()
    }
  }

  require(vocab.length > 0, "The vocabulary size should be > 0. Lower minDF as necessary.")
  copyValues(new CountVectorizerModel(uid, vocab).setParent(this))
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement