From f1774051307836e41f0dec290e4190cf50f2666c Mon Sep 17 00:00:00 2001 From: Yusef Ouda Date: Wed, 3 Sep 2014 13:53:01 -0500 Subject: [PATCH 1/2] Finished project gutenberg, and it predicts with 100% accuracy using the test books. --- data/stopwords.txt | 2 +- gutenberg.rb | 2 +- lib/complex_predictor.rb | 53 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..5b0a618 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,many diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..f28f5d0 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,6 +23,6 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +# run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..45b5be6 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,4 +1,5 @@ require_relative 'predictor' +require 'pry-byebug' class ComplexPredictor < Predictor # Public: Trains the predictor on books in our dataset. This method is called @@ -6,7 +7,46 @@ class ComplexPredictor < Predictor # # Returns nothing. def train! + # @data looks like: + # + # { + # philosophy: { + # word1 => count, + # top_words => [list, of, top, words, here] + # }, + # archeology: { + # words: 2000, + # books: 5, + # } + # } @data = {} + + @all_books.each do |category, books| + @data[category] = Hash.new(0) + books.each do |filename, tokens| + tokens.each do |token| + if good_token?(token) + @data[category][token] += 1 + end + end + end + sorted = sort_words(@data, category) + top_words = get_top_words(sorted) + top_words_hash = {} + top_words.each do |word| + top_words_hash[word] = nil + end + @data[category][:top_words] = top_words_hash + end + end + + def sort_words(data, category) + data[category].sort_by { |k,v| v }.reverse + end + + def get_top_words(sorted) + top = sorted.take(200) + top.map! { |word, count| word } end # Public: Predicts category. @@ -16,7 +56,14 @@ def train! # Returns a category. def predict(tokens) # Always predict astronomy, for now. - :astronomy + counts = Hash.new(0) + @data.each do |category, _| + tokens.each do |word| + if @data[category][:top_words].include?(word) + counts[category] += 1 + end + end + end + counts.sort_by { |k,v| v }.last.first end -end - +end \ No newline at end of file From faf67dac3c45b7fa10eb34c784045df6ddec5d30 Mon Sep 17 00:00:00 2001 From: Yusef Ouda Date: Thu, 4 Sep 2014 10:48:22 -0500 Subject: [PATCH 2/2] Modified predict method to use an intersection to find out the number of top words in a book. --- lib/complex_predictor.rb | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 45b5be6..5113f93 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -7,18 +7,6 @@ class ComplexPredictor < Predictor # # Returns nothing. def train! - # @data looks like: - # - # { - # philosophy: { - # word1 => count, - # top_words => [list, of, top, words, here] - # }, - # archeology: { - # words: 2000, - # books: 5, - # } - # } @data = {} @all_books.each do |category, books| @@ -32,11 +20,7 @@ def train! end sorted = sort_words(@data, category) top_words = get_top_words(sorted) - top_words_hash = {} - top_words.each do |word| - top_words_hash[word] = nil - end - @data[category][:top_words] = top_words_hash + @data[category][:top_words] = top_words end end @@ -45,7 +29,7 @@ def sort_words(data, category) end def get_top_words(sorted) - top = sorted.take(200) + top = sorted.take(900) top.map! { |word, count| word } end @@ -55,15 +39,10 @@ def get_top_words(sorted) # # Returns a category. def predict(tokens) - # Always predict astronomy, for now. counts = Hash.new(0) @data.each do |category, _| - tokens.each do |word| - if @data[category][:top_words].include?(word) - counts[category] += 1 - end - end + counts[category] = (tokens & @data[category][:top_words]).length end - counts.sort_by { |k,v| v }.last.first + counts.max_by { |k,v| v }.first end end \ No newline at end of file