-
Notifications
You must be signed in to change notification settings - Fork 0
/
bibUpdate.m
84 lines (70 loc) · 3.07 KB
/
bibUpdate.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
function [bibStruct]= bibUpdate(bibStruct, wordBag, nGrams, ...
    topicLabels, fileNames, options)
% BIBUPDATE Updates existing bibStruct with generated keywords.
%   [bibStruct] = bibUpdate(bibStruct, wordBag, nGrams, topicLabels,
%                           fileNames, options)
%
%   CURRENTLY, PREVIOUS KEYWORDS OF MATCHED BIBTEX ENTRIES ARE OVERWRITTEN.
%   Entries whose file cannot be matched to any element of fileNames are
%   left untouched.
%
%   Adds keywords to existing bibliographic information stored in a struct.
%   Keywords are based on word and nGram frequency information as well as
%   topic modelling. Each bibStruct entry is matched to a document through
%   its "file" field; the keywords of the matched document are written to
%   the entry's "keywords" field as a single comma-separated char vector.
%
%   ====INPUT=====
%   bibStruct       struct          Struct array containing bib.
%                                   information; every entry needs one
%                                   string scalar in its "file" field
%   wordBag         bagOfWords      bag-of-words model of documents
%   nGrams          bagOfNgrams     bag-of-nGrams model of documents
%   topicLabels     categorical     Contains the most probable
%                                   topics of each document (one row per
%                                   document)
%   fileNames       cell array      List of file locations, one per
%                                   document, in the same order as the
%                                   rows of the models
%   -options-
%   topKWords       integer         Number of most frequent words and
%                                   nGrams per document to be passed to
%                                   keywords (default: 4)
%   fullFile        logical         true: match entries by full file path
%                                   false: match by file name only
%                                   (default: false)
%
%   ====OUTPUT====
%   bibStruct       struct          Struct with updated bibliography,
%                                   includes keywords extracted from
%                                   wordBag, nGram models and topic
%                                   modelling
arguments
    bibStruct struct
    wordBag bagOfWords
    nGrams bagOfNgrams
    topicLabels categorical
    fileNames cell
    options.topKWords {mustBeInteger} = 4
    options.fullFile = false
end

% Nothing can be matched without bibliography entries or document files.
if isempty(bibStruct) || isempty(fileNames)
    return
end

% Extract keywords for each document (rows = documents, columns = rank).
[~, keywordIdx] = maxk(wordBag.Counts, options.topKWords, 2);
% Indexing a vector with a vector returns the orientation of the indexed
% vector, so force the documents-by-rank layout explicitly. Otherwise a
% single document or topKWords == 1 would be joined along the wrong
% dimension below.
wordList = reshape(wordBag.Vocabulary(keywordIdx), size(keywordIdx));

[~, nGramIdx] = maxk(nGrams.Counts, options.topKWords, 2);
% One string per nGram. Rows of Ngrams holding shorter nGrams are
% presumably padded with empty strings (see the bagOfNgrams
% documentation), which would leave trailing spaces after the join.
nGramVocabulary = strip(join(nGrams.Ngrams, " ", 2));
nGramList = reshape(nGramVocabulary(nGramIdx), size(nGramIdx));

topicList = join(string(topicLabels), ", ", 2);

% List of keywords, combined as a single string per document
delimiter = ", ";
keywords = join(wordList, delimiter, 2) + delimiter + ...
    join(nGramList, delimiter, 2) + delimiter + topicList;

% Collect the file of every bibliography entry. Concatenation silently
% drops empty fields and merges char vectors, which would shift the
% entry/keyword alignment, so verify that one path per entry survived.
structFiles = fullfile([bibStruct.file]);
if numel(structFiles) ~= numel(bibStruct)
    error('bibUpdate:invalidFileField', ...
        ['Every bibStruct entry must contain exactly one string ' ...
        'scalar in its "file" field.']);
end

% Normalise the document paths the same way as the bibliography paths so
% that differing file separators do not prevent a match.
storedFiles = fullfile(string(fileNames));
if numel(storedFiles) ~= numel(keywords)
    error('bibUpdate:documentCountMismatch', ...
        ['fileNames must contain one entry per document ' ...
        '(%d files for %d keyword lists).'], ...
        numel(storedFiles), numel(keywords));
end

% Compare paths independent of the drive letter (e.g., compatible with
% changing drive letters). Mostly useful if the bibtex file was read from
% a reference manager.
structFiles = stripDrivePrefix(structFiles);
storedFiles = stripDrivePrefix(storedFiles);

if ~options.fullFile
    % Match on the bare file name (no folder, no extension).
    [~, structFiles, ~] = fileparts(structFiles);
    [~, storedFiles, ~] = fileparts(storedFiles);
end

% Matching indices based on file path and name. matchedIdx is 0 for
% entries without a matching document, so only matched entries may be
% used for indexing.
[isMatched, matchedIdx] = ismember(structFiles, storedFiles);

% Assigning keywords to struct
% TODO: implement appending keywords instead of overwriting.
if any(isMatched)
    matchedKeywords = keywords(matchedIdx(isMatched));
    % Brace indexing a string array yields char vectors, one per entry.
    [bibStruct(isMatched).keywords] = matchedKeywords{:};
end
end

function paths = stripDrivePrefix(paths)
% STRIPDRIVEPREFIX Removes everything up to and including the first ":".
%   Paths without a colon (e.g. POSIX paths or bare file names) are
%   returned unchanged; calling extractAfter on them directly would not
%   return the original path.
hasColon = contains(paths, ":");
paths(hasColon) = extractAfter(paths(hasColon), ":");
end