% libraryOrganizer.m

function bibStruct = libraryOrganizer
%   LIBRARYORGANIZER    Adds machine-generated keywords for pdf documents
%   function bibStruct = libraryOrganizer
%
%   Performs topic modelling on a collection of text files specified via
%   user input. Best suited for academic texts which include a DOI, e.g.
%   research articles. Creates a BibTex file with automatically generated
%   keywords for later import to reference managers.
%   Topic modelling is a way to identify common themes across multiple
%   documents based on word frequencies. With the approach used here, an
%   individual document can belong to multiple topics (probabilistic
%   model).
%
%   Topic Modelling Method: Latent Dirichlet Allocation (MATLAB fitlda)
%
%   -Choose a folder with text files
%   -Optionally: Choose existing BibTex files
%   -prepare text and perform topic modelling
%   -visualize documents and topics
%   -create output BibTex file
%
%   ====INPUT=====
%   User input via GUI elements. Dialogs that are closed without a choice
%   fall back to their default button where a default exists (search
%   depth "full", BibTex source "No", tokenization "All words", number of
%   topics nTopicDef).
%
%   ====OUTPUT====
%   bibStruct       struct          Contains bibliographic information for
%                                   user-defined text files and
%                                   automatically generated keywords.
%   author: Joshua Pepe Woller
%

% Default number of topics to be extracted in topic modelling
nTopicDef = 5;

% BibTex entries of a user-supplied file that could not be matched to any
% parsed document; stays empty if no user file is used.
origBibTex = [];

%% READ PDF FILES

% Depth of Search, specify level down to which files should be searched
searchDepth = questdlg({"Please specify the search depth for retrieving " + ...
    "pdf files."; "Full: Look into folder and all subfolders (and their subfolders...)" + ...
    " (e.g., the fully nested folder structure)"; ...
    "First: Only consider files directly in the folder, ";
    "Second: Look into folder and direct subfolders for files " + ...
    "as well as direct subfolders"}, "Search depth setting", ...
    "full", "first", "second", "full");
if isempty(searchDepth)
    % questdlg returns an empty char if the dialog is closed without a
    % choice; use the dialog's default button in that case.
    searchDepth = 'full';
end

% Retrieve files from specified folder structure at given search depth
% fileNames: list of file names
% nFiles:    Get number of files found
[~, fileNames, nFiles] = fileSearch("searchDepth", searchDepth);

%% RETRIEVE RAW TEXT AND BIBLIOGRAPHIC INFORMATION

% Optional input of existing BibTex formatted file that describes the pdf
% files in the folder (e.g., generated by a reference manager program)
% Should be in .bib or .txt format.
ownBib = questdlg({"Do you want to choose an existing BibTex file?"; ...
    "If not, information will be retrieved from an online service " + ...
    "(crossref.org)."}, ...
    "Choose BibTex file", "Yes", "No", "No");

% uigetfile returns the numeric value 0 if no file is chosen; start with
% that value so that "No", a closed dialog and a cancelled file selection
% all take the same download branch below.
bibFile = 0;
bibPath = 0;
if strcmp(ownBib, "Yes")
    [bibFile, bibPath] = uigetfile(["*.bib"; "*.txt"] , ...
        "OPTIONAL: Choose local BibTex file, if available (.bib/.txt).");
end

if ~isequal(bibFile, 0)
    % A local BibTex file was chosen: parse it to a struct.
    bibFile    = fullfile(bibPath, bibFile);
    % Extract text from pdf files, but do not download additional info
    textStruct = getBibInfo(nFiles, fileNames, "downloadBibTex", false);
    % Convert .bib or .txt file to struct
    bibStruct  = parseBibTex(bibFile);
    % Fill empty fields with <missing> values to allow for better indexing.
    % Otherwise, empty fields are simply omitted if contents of struct are
    % listed (e.g., via accessing the DOI field with bibStruct.doi)
    fieldNames = fieldnames(bibStruct);
    for fieldIdx = 1:numel(fieldNames)
        currField  = fieldNames{fieldIdx};
        % Logical indexing to find empty fields
        emptyIndex = arrayfun(@(entry) isempty(entry.(currField)), bibStruct);
        if any(emptyIndex(:))
            [bibStruct(emptyIndex).(currField)] = deal(missing);
        end
    end

    % Find BibTex entries that have the same DOI or file location as
    % extracted from pdf files. A BibTex file without "doi" or "file"
    % fields simply yields no matches for that criterion.
    nEntries  = numel(bibStruct);
    inBibDoi  = false(1, nEntries);
    inBibFile = false(1, nEntries);
    if isfield(bibStruct, 'doi')
        inBibDoi = ismember([bibStruct.doi], [textStruct.doi]);
    end
    if isfield(bibStruct, 'file')
        % Remove escape character "\" present in MS windows style paths.
        fileLoc   = strrep([bibStruct.file], "\\", "\");
        inBibFile = ismember(fileLoc, [textStruct.file]);
    end
    isMatched = inBibDoi | inBibFile;

    % Set aside BibTex info of documents that were not matched to files in
    % the folder structure (neither by DOI nor by file location). Later,
    % this gets written to the BibTex file again. Using the complement of
    % the kept set avoids writing file-matched entries twice.
    origBibTex = bibStruct(~isMatched);
    % Keep the entries matched to parsed pdf files by DOI or file location
    bibStruct  = bibStruct(isMatched);
else
    % No BibTex info was given by the user (or no valid file was chosen):
    % extract text from pdf files and try to download the BibTex info from
    % crossref.org using the DOI extracted from pdf texts.
    textStruct = getBibInfo(nFiles, fileNames, "downloadBibTex", true);
    bibStruct  = parseBibTex(textStruct);
end

%% TEXT PREPARATION AND TOKENIZATION

% With many and large texts, topic modelling can be faster if only nouns
% are used. This however leads to weird n-Grams due to the deletion of
% adjectives and verbs between nouns.
tokOption = questdlg({"Use only nouns for tokenization and topic modelling?"; ...
    "Topic Modelling using only nouns from a tokenized document" + ...
    " can be faster, but n-Grams become less comprehensible."; ...
    "'All words' recommended as default."}, ...
    "Tokenization Options", "All words", "Nouns only", "All words");
% strcmp always yields a scalar logical, also for a closed dialog (empty
% char), which then counts as "All words".
nounOnly = strcmp(tokOption, "Nouns only");

% Tokenizing simplifies and unifies word forms, making them suitable for
% algorithmic analysis. Short and infrequent words get deleted.
% For optional arguments, see corresponding documentation.
% textStruct.text contains parsed text of individual pdf files
tokText = preprocessingText([textStruct.text], "NounOnly", nounOnly);

% Bag-of-Words and Bag-of-N-Gram models reduce text to a frequency count
% for subsequent analysis.
[wordBag, nGrams] = wordBagPack(tokText);

% Topic Modelling can be performed on individual words (e.g., "Neuron") or
% on n-Grams (e.g., the bi-Gram "Neuron doctrine")
% -
% User defined choice of basis for topic modelling. Ask until choice is
% made or program is quit.
textItemType = '';
while isempty(textItemType)
    % Dialog to choose text unit for Topic Modelling; closing the window
    % leads to empty output
    textItemType = questdlg("Which text units should be used for topic modelling?", ...
         "Topic Modelling Selection", 'words', 'n-Grams', 'words');
    if isempty(textItemType)
        % If no option was chosen, ask again and give opportunity to quit
        % execution
        quitTopic = questdlg("No method was chosen. Quit program?", ...
            "Exit Topic Modelling", "Go back", "Quit", "Go back");

        if strcmp(quitTopic, "Quit")
            % Give error and exit function execution
            errordlg("User did not specify text unit for topic modelling. " + ...
                "Program terminated.");
            return
        end
    end
end


%% TOPIC MODELLING

% GUI to set number of topics (integer); returns a cell array, which is
% empty if the dialog was cancelled.
nTopicsAnswer = inputdlg('How many topics should be extracted?', ...
    'Number of Topics',[1 50], {num2str(nTopicDef)});
if isempty(nTopicsAnswer)
    nTopicsRaw = "";
else
    nTopicsRaw = string(nTopicsAnswer{1});
end
% Convert to number; str2double returns NaN for non-numeric text
nTopics = str2double(nTopicsRaw);
if ~(isfinite(nTopics) && nTopics >= 1 && nTopics == fix(nTopics))
    % If no valid positive integer was passed, use default value
    warndlg("'"+nTopicsRaw +"'"+ " is not a valid input " + ...
        "for number of topics. " + ...
        "Using default value of "+nTopicDef+".")
    nTopics = nTopicDef;
end

if nTopics >= wordBag.NumDocuments/2
    warning("Less than two documents per topic on average." + ...
        " Consider reducing the number of topics or increasing" + ...
        " the number of documents.")
end
% Perform topic modelling based on user specified text units and number of
% topics.
switch textItemType
    case "words"
        topicModel = fitlda(wordBag, nTopics,"Verbose",0);
    case "n-Grams"
        % Warn user that nounOnly and n-Grams don't get along
        if nounOnly
            warndlg("Extracting only nouns not recommended for N-gram based models!")
        end
        topicModel = fitlda(nGrams, nTopics, "Verbose", 0);
end

%% INSPECT AND RENAME TOPICS

% Show word clouds describing all topics; User can define names for the
% topics that later serve as keywords for the Bibtex entries.
% TSNE plot illustrates distance between individual topic clusters.
[~, topicLabels] = plotTopic (topicModel, bibStruct);

%% BIBTEX FILE EXPORT

% Add generated keywords (frequent words, nGrams and associated topics) to
% bibStruct.
bibStruct = bibUpdate(bibStruct, wordBag, nGrams, topicLabels, fileNames);
% Select file to write Bibtex to. Ask until a file is chosen or the user
% decides to quit.
filePath = '';
while isempty(filePath)
    [writeFile, writePath] = uiputfile('*.bib', "Choose Location to save " + ...
        "BibTex file", "keywordedLibrary");
    if isequal(writeFile, 0)
        % uiputfile returns 0 if no file was chosen; these values must not
        % be passed to fullfile. Ask again; offer opportunity to quit.
        quitTopic = questdlg("No file for saving BibTex specified. Quit program?", ...
            "Exit?", "Go back", "Quit", "Go back");
        if strcmp(quitTopic, "Quit")
            errordlg("User did not specify BibTex file location. " + ...
                "Program terminated.");
            return
        end
    else
        filePath = fullfile(writePath, writeFile);
    end
end

% Write keyworded bibStruct to file
writeBibTex(bibStruct, filePath)
% If we received a user-defined BibTex file, we append unmatched entries
% (e.g. where no corresponding files in our folder were found)
% to the new BibTex file so that the initial library is complete again.
if ~isempty(origBibTex)
    writeBibTex(origBibTex, filePath, "mode", 'a+')
end
% Notify user of success; OK to end function
uiwait(msgbox("Done! BibTex successfully written to: "+filePath+" .", ...
    "Success", "modal"))
end