Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lectures: Show slide numbers to be removed in automatic unit processing #7350

Merged
merged 14 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package de.tum.in.www1.artemis.service;

import java.io.*;
import java.nio.file.Path;
import java.time.ZonedDateTime;
import java.util.*;

import javax.validation.constraints.NotNull;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
Expand Down Expand Up @@ -38,6 +41,11 @@

private final AttachmentUnitService attachmentUnitService;

// Text extractor used by the methods of this service that read the text of single pages.
// NOTE(review): this one instance is reused for every call on this service object -
// confirm that it is never used concurrently.
private final PDFTextStripper pdfTextStripper = new PDFTextStripper();

// A pdf splitter that should be used to split a file into single pages.
// No start or end page is set on it in this class; keep it that way, otherwise every
// method sharing this instance would be affected.
// NOTE(review): shared between all calls in the same way as the text stripper above -
// confirm that concurrent use is not possible.
private final Splitter pdfSinglePageSplitter = new Splitter();

public LectureUnitProcessingService(SlideSplitterService slideSplitterService, FileService fileService, LectureRepository lectureRepository,
AttachmentUnitService attachmentUnitService) {
this.fileService = fileService;
Expand All @@ -50,15 +58,14 @@
* Split units from given file according to given split information and saves them.
*
* @param lectureUnitInformationDTO The split information
* @param file The file (lecture slide) to be split
* @param fileBytes The byte content of the file (lecture slides) to be split
* @param lecture The lecture that the attachment unit belongs to
* @return The prepared units to be saved
*/
public List<AttachmentUnit> splitAndSaveUnits(LectureUnitInformationDTO lectureUnitInformationDTO, MultipartFile file, Lecture lecture) throws IOException {
public List<AttachmentUnit> splitAndSaveUnits(LectureUnitInformationDTO lectureUnitInformationDTO, byte[] fileBytes, Lecture lecture) throws IOException {

try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); PDDocument document = Loader.loadPDF(file.getBytes())) {
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); PDDocument document = Loader.loadPDF(fileBytes)) {
List<AttachmentUnit> units = new ArrayList<>();
Splitter pdfSplitter = new Splitter();

for (LectureUnitSplitDTO lectureUnit : lectureUnitInformationDTO.units()) {
// make sure output stream doesn't contain old data
Expand All @@ -67,7 +74,7 @@
AttachmentUnit attachmentUnit = new AttachmentUnit();
Attachment attachment = new Attachment();
PDDocumentInformation pdDocumentInformation = new PDDocumentInformation();

Splitter pdfSplitter = new Splitter();
pdfSplitter.setStartPage(lectureUnit.startPage());
pdfSplitter.setEndPage(lectureUnit.endPage());
// split only based on start and end page
Expand Down Expand Up @@ -100,28 +107,60 @@
}
}

/**
* Gets the slides that should be removed by the given keyphrase
*
* @param fileBytes The byte content of the file (lecture slides) to be split
* @param commaSeparatedKeyphrases key phrases that identify slides about to be removed
* @return list of the number of slides that will be removed
*/
public List<Integer> getSlidesToRemoveByKeyphrase(byte[] fileBytes, String commaSeparatedKeyphrases) {
List<Integer> slidesToRemove = new ArrayList<>();
if (commaSeparatedKeyphrases.isEmpty()) {
return slidesToRemove;
}
try (PDDocument document = Loader.loadPDF(fileBytes)) {
List<PDDocument> pages = pdfSinglePageSplitter.split(document);
List<String> keyphrasesList = getKeyphrasesFromString(commaSeparatedKeyphrases);

for (int index = 0; index < pages.size(); index++) {
try (PDDocument currentPage = pages.get(index)) {
String slideText = pdfTextStripper.getText(currentPage);

if (slideContainsKeyphrase(slideText, keyphrasesList)) {
slidesToRemove.add(index);

Check warning on line 131 in src/main/java/de/tum/in/www1/artemis/service/LectureUnitProcessingService.java

View check run for this annotation

Teamscale / teamscale-findings

src/main/java/de/tum/in/www1/artemis/service/LectureUnitProcessingService.java#L131

This method is slightly nested [0]. Consider extracting helper methods or reducing the nesting by using early breaks or returns. [0] https://teamscale.io/findings.html#details/GitHub-ls1intum-Artemis?t=feature%2Flecture%2Fshow-removed-slide-numbers%3AHEAD&id=600005BDFAF82F7EFF2C5ADC022E6359
}
}
}
}
catch (IOException e) {
log.error("Error while retrieving slides to remove from document", e);
throw new InternalServerErrorException("Error while retrieving slides to remove from document");
}
return slidesToRemove;
}

/**
* Removes the slides containing any of the key phrases from the given document.
*
* @param document document to remove slides from
* @param removeSlidesCommaSeparatedKeyPhrases key phrases that identify slides about to be removed
* @param document document to remove slides from
* @param commaSeparatedKeyphrases keyphrases that identify slides about to be removed
*/
private void removeSlidesContainingAnyKeyPhrases(PDDocument document, String removeSlidesCommaSeparatedKeyPhrases) {
private void removeSlidesContainingAnyKeyPhrases(PDDocument document, String commaSeparatedKeyphrases) {
try {
PDFTextStripper pdfTextStripper = new PDFTextStripper();
Splitter pdfSplitter = new Splitter();
List<PDDocument> pages = pdfSplitter.split(document);
List<PDDocument> pages = pdfSinglePageSplitter.split(document);
List<String> keyphrasesList = getKeyphrasesFromString(commaSeparatedKeyphrases);

// Uses a decrementing loop (starting from the last index) to ensure that the
// index values are adjusted correctly when removing pages.
for (int index = pages.size() - 1; index >= 0; index--) {
PDDocument currentPage = pages.get(index);
String slideText = pdfTextStripper.getText(currentPage);
try (PDDocument currentPage = pages.get(index)) {
String slideText = pdfTextStripper.getText(currentPage);

if (slideContainsKeyphrase(slideText, removeSlidesCommaSeparatedKeyPhrases)) {
document.removePage(index);
if (slideContainsKeyphrase(slideText, keyphrasesList)) {
document.removePage(index);

Check warning on line 161 in src/main/java/de/tum/in/www1/artemis/service/LectureUnitProcessingService.java

View check run for this annotation

Teamscale / teamscale-findings

src/main/java/de/tum/in/www1/artemis/service/LectureUnitProcessingService.java#L161

This method is slightly nested [0]. Consider extracting helper methods or reducing the nesting by using early breaks or returns. [0] https://teamscale.io/findings.html#details/GitHub-ls1intum-Artemis?t=feature%2Flecture%2Fshow-removed-slide-numbers%3AHEAD&id=3EE5AAED06D6E2A4E8B4F1AA8679149F
}
}
currentPage.close(); // make sure to close the document
}
}
catch (IOException e) {
Expand All @@ -130,22 +169,22 @@
}
}

private boolean slideContainsKeyphrase(String slideText, String removeSlidesCommaSeparatedKeyPhrases) {
private boolean slideContainsKeyphrase(String slideText, List<String> keyphrasesList) {
String lowerCaseSlideText = slideText.toLowerCase();
return Arrays.stream(removeSlidesCommaSeparatedKeyPhrases.split(",")).anyMatch(keyphrase -> lowerCaseSlideText.contains(keyphrase.strip().toLowerCase()));
return keyphrasesList.stream().anyMatch(keyphrase -> lowerCaseSlideText.contains(keyphrase.strip().toLowerCase()));
}

/**
* Prepare information of split units for client
*
* @param file The file (lecture slide) to be split
* @param fileBytes The byte content of the file (lecture slides) to be split
* @return The prepared information of split units LectureUnitInformationDTO
*/
public LectureUnitInformationDTO getSplitUnitData(MultipartFile file) {
public LectureUnitInformationDTO getSplitUnitData(byte[] fileBytes) {

try {
log.debug("Start preparing information of split units for the file {}", file);
Outline unitsInformation = separateIntoUnits(file);
log.debug("Start preparing information of split units.");
Outline unitsInformation = separateIntoUnits(fileBytes);
Map<Integer, LectureUnitSplit> unitsDocumentMap = unitsInformation.splits;
int numberOfPages = unitsInformation.totalPages;

Expand All @@ -161,29 +200,56 @@
}
}

/**
* Temporarily saves a file that will be processed into lecture units.
*
* @param lectureId the id of the lecture the file belongs to
* @param file the file to be saved
* @param minutesUntilDeletion duration the file gets saved for
* @return the last part of the filename. Use {@link LectureUnitProcessingService#getPathForTempFilename(long, String) getPathForTempFilename}
* to get the full file path again.
*/
public String saveTempFileForProcessing(long lectureId, MultipartFile file, int minutesUntilDeletion) throws IOException {
String prefix = "Temp_" + lectureId + "_";
Path filePath = fileService.generateFilePath(prefix, FilenameUtils.getExtension(file.getOriginalFilename()), FilePathService.getTempFilePath());
FileUtils.copyInputStreamToFile(file.getInputStream(), filePath.toFile());
Dismissed Show dismissed Hide dismissed
fileService.schedulePathForDeletion(filePath, minutesUntilDeletion);
return filePath.getFileName().toString().substring(prefix.length());
}

/**
* Gets the path of the temporary file for a give lectureId and filename
*
* @param lectureId the id of the lecture the file belongs to
* @param filename the last part of the filename (timestamp and extension)
* @return Path of the file
*/
public Path getPathForTempFilename(long lectureId, String filename) {
String fullFilename = "Temp_" + lectureId + "_" + FileService.sanitizeFilename(filename);
return FilePathService.getTempFilePath().resolve(fullFilename);
Dismissed Show dismissed Hide dismissed
}

/**
* This method prepares a map with information on how the slide
* is going to be split. The map looks like the following:
* Map<OutlineNumber, (UnitName, StartPage, EndPage)>
*
* @param file The file (lecture pdf) to be split
* @param fileBytes The byte content of the file (lecture pdf) to be split
* @return The prepared map
*/
private Outline separateIntoUnits(MultipartFile file) throws IOException {
try (PDDocument document = Loader.loadPDF(file.getBytes())) {
private Outline separateIntoUnits(byte[] fileBytes) throws IOException {
try (PDDocument document = Loader.loadPDF(fileBytes)) {
Map<Integer, LectureUnitSplit> outlineMap = new HashMap<>();
Splitter pdfSplitter = new Splitter();
PDFTextStripper pdfStripper = new PDFTextStripper();
// split the document into single pages
List<PDDocument> pages = pdfSplitter.split(document);
List<PDDocument> pages = pdfSinglePageSplitter.split(document);
int numberOfPages = document.getNumberOfPages();
ListIterator<PDDocument> iterator = pages.listIterator();

int outlineCount = 0;
int index = 1;
while (iterator.hasNext()) {
PDDocument currentPage = iterator.next();
String slideText = pdfStripper.getText(currentPage);
String slideText = pdfTextStripper.getText(currentPage);

if (isOutlineSlide(slideText)) {
outlineCount++;
Expand Down Expand Up @@ -231,4 +297,11 @@
*/
private record Outline(Map<Integer, LectureUnitSplit> splits, int totalPages) {
}

/**
 * Parses a string containing comma-separated keyphrases into a list of keyphrases.
 * Whitespace around every keyphrase is stripped and blank entries are dropped.
 *
 * @param commaSeparatedKeyphrases the keyphrases separated by commas, may be {@code null}
 * @return an unmodifiable list of the stripped, non-blank keyphrases; empty if there are none
 */
private List<String> getKeyphrasesFromString(String commaSeparatedKeyphrases) {
    if (commaSeparatedKeyphrases == null) {
        return List.of();
    }
    return Arrays.stream(commaSeparatedKeyphrases.split(",")).map(String::strip).filter(keyphrase -> !keyphrase.isEmpty()).toList();
}
}
Loading
Loading