diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..5b0a618 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,many diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..f28f5d0 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,6 +23,6 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +# 
run!(SimplePredictor)
run!(ComplexPredictor, debug: true)

# ---- lib/complex_predictor.rb ----
require_relative 'predictor'
# NOTE(review): interactive-debugger dependency; nothing visible in this class
# calls into it, so it looks like a leftover -- confirm and drop before release.
require 'pry-byebug'

# Category predictor that scores a book by how many of its distinct tokens
# appear among each category's most frequent training tokens.
class ComplexPredictor < Predictor
  # Default number of most-frequent tokens kept per category.
  TOP_WORD_LIMIT = 900

  # Public: Trains the predictor on books in our dataset.
  #
  # Builds @data as { category => { token => count, :top_words => [tokens] } }.
  # Reads @all_books, iterated here as category => { filename => tokens }
  # (presumably populated by Predictor -- confirm), and keeps only tokens
  # accepted by good_token?, which is defined outside this file.
  #
  # Returns nothing meaningful (the value of @all_books.each).
  def train!
    @data = {}

    @all_books.each do |category, books|
      @data[category] = Hash.new(0)
      books.each do |_filename, tokens|
        tokens.each do |token|
          @data[category][token] += 1 if good_token?(token)
        end
      end
      # Rank before :top_words is stored, so the ranking only sees counts.
      @data[category][:top_words] = get_top_words(sort_words(@data, category))
    end
  end

  # Public: Ranks a category's tokens from most to least frequent.
  #
  # data     - Hash of category => { token => count }.
  # category - Key of the category to rank.
  #
  # Returns an Array of [token, count] pairs, highest count first.
  def sort_words(data, category)
    # Skip the :top_words bookkeeping entry (an Array) so calling this again
    # after training does not try to compare an Array with an Integer.
    token_counts = data[category].reject { |key, _count| key == :top_words }
    token_counts.sort_by { |_token, count| count }.reverse
  end

  # Public: Extracts the leading tokens from a ranked list.
  #
  # sorted - Array of [token, count] pairs, as returned by sort_words.
  # limit  - Maximum number of tokens to keep (default: TOP_WORD_LIMIT).
  #
  # Returns an Array of tokens in ranked order.
  def get_top_words(sorted, limit = TOP_WORD_LIMIT)
    sorted.take(limit).map { |token, _count| token }
  end

  # Public: Predicts category.
  #
  # tokens - Array of tokens from the book to classify.
  #
  # Returns the category whose top words share the most distinct tokens with
  # the input (the first such category on a tie), or nil when no category has
  # been trained.
  def predict(tokens)
    # max_by returns nil for an empty collection, so an empty @data yields nil
    # instead of raising NoMethodError on nil.first.
    @data.keys.max_by do |category|
      (tokens & @data[category][:top_words]).length
    end
  end
end