Skip to content

Commit

Permalink
overview documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
JingQunCui committed Dec 5, 2023
1 parent 285538e commit 942ce96
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
6 changes: 5 additions & 1 deletion ACMAS/app/ACMAS_Web/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
- When a file is uploaded to the filesystem, run OCR to extract its text and create a text file
- Check for name availability with created text file
- Store text file on filesystem and database
Overview:
- Issue: Had trouble installing PyMuPDF on the Python Alpine image in Docker containers
- Solution: Built PyMuPDF from source and made a custom wheel for the package
"""
class OCR:
def extract_text_from_pdf(self, fType, course, fileName, fileUrl):
Expand All @@ -19,7 +23,7 @@ def extract_text_from_pdf(self, fType, course, fileName, fileUrl):
removeExt = os.path.splitext(fileName)[0]
txt_file_name = removeExt + ".txt"
txt_file_path = os.path.join(settings.MEDIA_ROOT, txt_file_name)
text = ''
text = ""
pdf_document = fitz.open(fileUrl)
for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]
Expand Down
3 changes: 1 addition & 2 deletions ACMAS/app/ACMAS_Web/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ def uploadFile(self, uni, course, fType, file):
file_url = fs.url(savedFile) # Retrieve the file path
print(f'FILE "{savedFile}" uploaded to "{file_url}"\n')

ocrObject = OCR
ocrObject.extract_text_from_pdf(fType, course, fileName, file_url)
OCR().extract_text_from_pdf(fType, course, fileName, file_url)

# Adding file to database
db_file = UploadedFile(
Expand Down

0 comments on commit 942ce96

Please sign in to comment.