-
Notifications
You must be signed in to change notification settings - Fork 0
/
README
76 lines (55 loc) · 2 KB
/
README
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# ////////////////// #
# /// maxmodelr /// #
# //////////////// #
# SET UP
# ... by cloning the repository
$ git clone https://github.com/dh-thesis/maxmodelr.git
$ cd maxmodelr
# REQUIREMENTS
--> check if you fulfill the requirements listed
--> at https://github.com/herreio/maxplanckr
# PUBLICATION SEARCH
# start R console
$ R
# if it's the first time, packrat will install itself
# after that simply run the following command to set up R environment
> packrat::restore()
# this will take some time, grab yourself a coffee...
# load package
> devtools::load_all()
# load topic coupling data (k = number of topics)
> couple <- load_couple(k=25)
> format(object.size(couple), units="MB")
# [1] "541 Mb" <-- be aware of this memory usage, you need some free RAM!
# set search query
> query <- "information retrieval"
# choose topic model
> models <- c("lda_all","lda_mpi","lda_pers","btm_all")
> model <- sample(models, 1)
# get n most probable items with same topic as query
> n = 20
> search_topic_items(couple, query=query, model=model, n=n)
# take c most probable items with same topic as query
# and compute js-divergence between theta of items and query
# return the first n items
> c = 100
> search_dist_items(couple, query=query, model=model, n=n, c=c)
# take all items with same topic as query
# and compute js-divergence between theta of items and query
# return the first n items
# !!
# !! depending on the number of topics in the model, this means comparing lots of vectors
# !! the lower the number of topics, the more publications are in the same topic
# !! and need to be compared. it is suggested not to use this function with the
# !! example coupling data provided in this repository with k = 25.
# !! instead create coupling data with k >= 100 (see below)
# !!
> search_topic_items_dist(couple, query=query, model=model, n=n)
# DATA PREPARATION
# create models
$ exec/runbtm > btm.log 2>&1
$ exec/runlda > lda.log 2>&1
# infer titles
$ exec/infer > infer.log 2>&1
# couple titles and topics
$ exec/couple