diff --git a/algorithms b/algorithms new file mode 160000 index 0000000..abaa657 --- /dev/null +++ b/algorithms @@ -0,0 +1 @@ +Subproject commit abaa657a91e97d18a1cb2be39dcce63e89ab44c7 diff --git a/data/arch.txt b/data/arch.txt new file mode 100644 index 0000000..066c5ed --- /dev/null +++ b/data/arch.txt @@ -0,0 +1 @@ +burial,dead,body,indians,feet,bones,dr,time,grave,place,house,timbers,work,hogán,houses,doorway,na,ia,small,ditto,tsa,sá,pa,zuñi,kiva,pueblo,village,house,stone,wall,walls,built,omaha \ No newline at end of file diff --git a/data/astronomy.txt b/data/astronomy.txt new file mode 100644 index 0000000..c6f8975 --- /dev/null +++ b/data/astronomy.txt @@ -0,0 +1 @@ +star,stars,situated,culminates,color,constellation,line,head,sun,day,moon,year,time,stars,month,sun,earth,page,stars,miles,fig,days,star,power \ No newline at end of file diff --git a/data/phil.txt b/data/phil.txt new file mode 100644 index 0000000..08e3c10 --- /dev/null +++ b/data/phil.txt @@ -0,0 +1 @@ +philosophy,religion,knowledge,nature,science,experience,world,real,nature,substance,reality,individual,actual,distinct,human,mind,plato,principle,intellect,gutenberg,project,prior,gods \ No newline at end of file diff --git a/data/religion.txt b/data/religion.txt new file mode 100644 index 0000000..90b611d --- /dev/null +++ b/data/religion.txt @@ -0,0 +1 @@ +christ spirit heart praise heaven hell glory jesus luke mary holy scripture sin virtue lord thou thy great god faith man disciple psalm david king isreal \ No newline at end of file diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..2acdbb6 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ 
-a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,god,life,word,many,man,world,himself,name,words,things,day,good,death,thing,never,done,nothing,though,right,again,against,still,three,question,called,reason, diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..6ccf404 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,5 +1,6 @@ require_relative 'predictor' + class 
class ComplexPredictor < Predictor
  # Number of most-frequent tokens kept from each training book
  # (the original code sliced 0..100, i.e. 101 entries).
  TOP_TOKENS_PER_BOOK = 101

  # Slice bounds applied to a book before scoring: skip the first 300 tokens
  # and drop the last 99. NOTE(review): presumably this trims boilerplate at
  # the start and end of each book -- confirm against the dataset.
  LEADING_SKIP = 300
  TRAILING_CUTOFF = -100

  # Category returned when no trained vocabulary matches any token. This is
  # the value the baseline implementation always predicted.
  DEFAULT_CATEGORY = :astronomy

  # Public: Trains the predictor on books in our dataset. This method is called
  # before the predict() method is called.
  #
  # Builds @data as { category => { token => count } }, where each book
  # contributes its TOP_TOKENS_PER_BOOK most frequent tokens that pass
  # good_token?. Counts for a token seen in several books of the same category
  # are summed.
  #
  # Returns nothing.
  def train!
    @data = {}

    @all_books.each do |category, books|
      vocabulary = {}
      books.each do |_filename, tokens|
        top_tokens(tokens).each do |token, count|
          vocabulary[token] = vocabulary.fetch(token, 0) + count
        end
      end
      @data[category] = vocabulary
    end

    nil
  end

  # Public: Predicts category.
  #
  # tokens - A list of tokens (words).
  #
  # Each trained category is scored by how many tokens of the scoring window
  # occur in its vocabulary (repeated tokens count every time). The category
  # with the strictly highest score wins; on a tie the category trained first
  # is kept.
  #
  # Returns a category (Symbol). Falls back to DEFAULT_CATEGORY when no token
  # matches any category.
  def predict(tokens)
    window = scoring_window(tokens)
    best_category = DEFAULT_CATEGORY
    best_score = 0

    @data.each do |category, vocabulary|
      score = window.count { |token| vocabulary.key?(token) }
      next unless score > best_score

      best_category = category
      best_score = score
    end

    best_category
  end

  private

  # Internal: Counts the tokens accepted by good_token? and returns the
  # TOP_TOKENS_PER_BOOK most frequent ones as [token, count] pairs, most
  # frequent first.
  def top_tokens(tokens)
    counts = Hash.new(0)
    tokens.each { |token| counts[token] += 1 if good_token?(token) }
    counts.sort_by { |_token, count| -count }.first(TOP_TOKENS_PER_BOOK)
  end

  # Internal: Returns the slice of tokens used for scoring. Array#[] with a
  # range yields nil when the list is shorter than LEADING_SKIP and an empty
  # array when the two bounds cross, so short inputs are scored in full
  # instead of raising or matching nothing.
  def scoring_window(tokens)
    window = tokens[LEADING_SKIP..TRAILING_CUTOFF]
    window && !window.empty? ? window : tokens
  end
end