Skip to content

Commit

Permalink
Merge pull request #38 from BritishGeologicalSurvey/non-cli-convert
Browse files (browse the repository at this point in the history)
Non cli convert
  • Loading branch information
volcan01010 authored Aug 27, 2021
2 parents 30b95bd + 681d4ac commit 40d9941
Show file tree
Hide file tree
Showing 12 changed files with 196 additions and 101 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
__pycache__
venv
.idea
.idea
.coverage
121 changes: 63 additions & 58 deletions app/ags.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
from functools import reduce
import logging
from pathlib import Path
import re
import subprocess
from typing import Tuple, Optional

import python_ags4
from python_ags4 import AGS4

from app.response_templates import PLAIN_TEXT_TEMPLATE

from app.response_templates import PLAIN_TEXT_TEMPLATE, RESPONSE_TEMPLATE

logger = logging.getLogger(__name__)

Expand All @@ -32,10 +30,7 @@ def validate(filename: Path, standard_AGS4_dictionary: Optional[str] = None) ->
logger.info("Validate called for %", filename.name)

# Prepare response with metadata
response = {'filename': filename.name,
'filesize': filename.stat().st_size,
'checker': f'python_ags4 v{python_ags4.__version__}',
'time': dt.datetime.now(tz=dt.timezone.utc)}
response = _prepare_response_metadata(filename)

# Select dictionary file if exists
if standard_AGS4_dictionary:
Expand All @@ -48,6 +43,11 @@ def validate(filename: Path, standard_AGS4_dictionary: Optional[str] = None) ->
else:
dictionary_file = None

# Return early if file is not .ags format
if filename.suffix != '.ags':
response['message'] = f"ERROR: {filename.name} is not .ags format"
return response

# Get error information from file
try:
errors = AGS4.check_file(filename, standard_AGS4_dictionary=dictionary_file)
Expand Down Expand Up @@ -89,44 +89,57 @@ def to_plain_text(response: dict) -> str:
return PLAIN_TEXT_TEMPLATE.render(response)


def convert(filename: Path, results_dir: Path) -> Tuple[Optional[Path], str]:
def convert(filename: Path, results_dir: Path) -> Tuple[Optional[Path], dict]:
"""
Convert filename between .ags and .xlsx. Write output to file in
results_dir and return path alongside processing log."""
results_dir and return path alongside job status data in dictionary."""
# Prepare variables and directory
new_extension = '.ags' if filename.suffix == '.xlsx' else '.xlsx'
converted_file = results_dir / (filename.stem + new_extension)
logger.info("Converting %s to %s", filename.name, converted_file.name)
if not results_dir.exists():
results_dir.mkdir()

args = [
'ags4_cli', 'convert', filename, converted_file
]
# Use subprocess to run the file. It will swallow errors.
# A timeout prevents the whole process hanging indefinitely.
result = subprocess.run(args, capture_output=True,
text=True, timeout=30)
logger.debug(result)
# Generate response based on result of subprocess
filesize = filename.stat().st_size / 1024
time_utc = dt.datetime.now(tz=dt.timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
if result.returncode != 0:
message = 'ERROR: ' + result.stderr
converted_file.unlink(missing_ok=True)
converted_file = None
elif result.stdout.startswith('ERROR: '):
message = result.stdout
converted_file.unlink(missing_ok=True)
converted_file = None
# Prepare response with metadata
response = _prepare_response_metadata(filename)

# Do the conversion
success = True
if filename.suffix == '.ags':
try:
AGS4.AGS4_to_excel(filename, converted_file)
except IndexError:
success = False
error_message = "ERROR: File does not have AGS format layout"
except UnboundLocalError:
# This error is thrown in response to a bug in the upstream code,
# which in turn is only triggered if the AGS file has duplicate
# headers.
success = False
error_message = "ERROR: File contains duplicate headers"
elif filename.suffix == '.xlsx':
try:
AGS4.excel_to_AGS4(filename, converted_file)
except AttributeError as err:
# Include error details here in case they provide a clue e.g. which
# attribute is missing
success = False
error_message = f"ERROR: Bad spreadsheet layout ({err.args[0]})"
else:
message = f"SUCCESS: {filename.name} converted to {converted_file.name}"
success = False
error_message = f"ERROR: {filename.name} is not .ags or .xlsx format"

log = RESPONSE_TEMPLATE.format(filename=filename.name,
filesize=filesize,
time_utc=time_utc,
message=message)
# Update response and clean failed files
if success:
response['message'] = f"SUCCESS: {filename.name} converted to {converted_file.name}"
response['valid'] = True
else:
response['message'] = error_message
response['valid'] = False
converted_file.unlink(missing_ok=True)
converted_file = None

return (converted_file, log)
return (converted_file, response)


def is_valid(filename: Path, standard_AGS4_dictionary: Optional[str] = None) -> bool:
Expand All @@ -136,30 +149,22 @@ def is_valid(filename: Path, standard_AGS4_dictionary: Optional[str] = None) ->
return validate(filename, standard_AGS4_dictionary=standard_AGS4_dictionary)['valid']


def get_unicode_message(stderr: str, filename: str) -> str:
def _prepare_response_metadata(filename: Path) -> dict:
"""
Generate useful message from Unicode error
Prepare a dictionary containing metadata to include in the response.
"""
m = re.search(r'.*in position (\d+):.*', stderr)
char_no, line_no, line, char = line_of_error(filename, int(m.group(1)))
message = f'ERROR: Unreadable character "{char}" at position {char_no} on line: {line_no}\nStarting: {line}\n\n'
return message

try:
filesize = filename.stat().st_size
except FileNotFoundError:
filesize = 0

def line_of_error(filename: Path, char_no: int) -> Tuple[int, int, str, str]:
"""
Return character, line number and start of line containing character at char_no.
Also return problem character
"""
with open(filename, encoding='ISO-8859-1') as f:
upto = f.read(char_no)
line_no = upto.count('\n') + 1
line = upto.split('\n')[-1]
char_no = len(line) + 1
char = f.read(1)
return char_no, line_no, line, char


class Ags4CliError(Exception):
"""Class for exceptions resulting from ags4_cli call."""
pass
response = {'filename': filename.name,
'filesize': filesize,
'checker': f'python_ags4 v{python_ags4.__version__}',
'time': dt.datetime.now(tz=dt.timezone.utc),
# The following are usually overwritten
'message': '',
'dictionary': '',
'errors': {},
'valid': False}
return response
10 changes: 0 additions & 10 deletions app/response_templates.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
# Text templates used to build responses.
from textwrap import dedent

from jinja2 import Template

RESPONSE_TEMPLATE = dedent("""
File Name: \t {filename}
File Size: \t {filesize:0.0f} kB
Time (UTC): \t {time_utc}
{message}
""").strip()

PLAIN_TEXT_TEMPLATE = Template("""
{{ filename }}: {{ message }}
Expand Down
5 changes: 3 additions & 2 deletions app/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import List

from fastapi import APIRouter, BackgroundTasks, File, Form, Request, UploadFile
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
from fastapi.responses import FileResponse, StreamingResponse

from app import ags
from app.errors import error_responses, InvalidPayloadError
Expand Down Expand Up @@ -173,7 +173,8 @@ async def convert_many(background_tasks: BackgroundTasks,
contents = await file.read()
local_file = tmp_dir / file.filename
local_file.write_bytes(contents)
converted, log = ags.convert(local_file, results_dir)
converted, result = ags.convert(local_file, results_dir)
log = ags.to_plain_text(result)
f.write(log)
f.write('\n' + '=' * 80 + '\n')
zipped_file = tmp_dir / RESULTS
Expand Down
File renamed without changes.
File renamed without changes.
Empty file added test/files/extension_is.bad
Empty file.
22 changes: 13 additions & 9 deletions test/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,31 @@
FROZEN_TIME = "2021-08-23 14:25:43"

ISVALID_RSP_DATA = [
('example1.ags', True),
('example_ags.ags', True),
('nonsense.ags', False),
('empty.ags', False),
('real/A3040_03.ags', False),
('example1.xlsx', False),
('example_xlsx.xlsx', False),
('random_binary.ags', False),
('real/CG014058_F.ags', False),
('real/Blackburn Southern Bypass.ags', False), # this file contains BOM character
('extension_is.bad', False),
]

GOOD_FILE_DATA = [
('example1.ags', 'SUCCESS: example1.ags converted to example1.xlsx'),
('example1.xlsx', 'SUCCESS: example1.xlsx converted to example1.ags'),
('example_ags.ags', ('SUCCESS: example_ags.ags converted to example_ags.xlsx', 'example_ags.xlsx')),
('example_xlsx.xlsx', ('SUCCESS: example_xlsx.xlsx converted to example_xlsx.ags', 'example_xlsx.ags')),
]

BAD_FILE_DATA = [
('nonsense.ags', ('IndexError: At least one sheet must be visible', 0)),
('empty.ags', ('IndexError: At least one sheet must be visible', 0)),
('dummy.xlsx', ("AttributeError: 'DataFrame' object has no attribute 'HEADING'", 5)),
('random_binary.ags', ('IndexError: At least one sheet must be visible', 1)),
('real/A3040_03.ags', ("UnboundLocalError: local variable 'group' referenced before assignment", 258)),
('nonsense.ags', ('ERROR: File does not have AGS format layout', 9)),
('empty.ags', ('ERROR: File does not have AGS format layout', 0)),
('dummy.xlsx', ("ERROR: Bad spreadsheet layout ('DataFrame' object has no attribute 'HEADING')", 4787)),
('random_binary.ags', ('ERROR: File does not have AGS format layout', 1024)),
('real/A3040_03.ags', ("ERROR: File contains duplicate headers", 264526)),
('extension_is.bad', ("ERROR: extension_is.bad is not .ags or .xlsx format", 0)),
# The following file is disabled: processing it crashes because a prompt for user input is triggered
# ('real/E52A4379 (2).ags', ("ERROR: File contains duplicate headers", 0))
]

DICTIONARIES = {
Expand Down
17 changes: 14 additions & 3 deletions test/fixtures_json.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import datetime as dt

JSON_RESPONSES = {
'example1.ags': {
'filename': 'example1.ags',
'example_ags.ags': {
'filename': 'example_ags.ags',
'filesize': 4039,
'checker': 'python_ags4 v0.3.6',
'dictionary': 'Standard_dictionary_v4_1.ags',
Expand Down Expand Up @@ -21102,7 +21102,18 @@
'double quotes.',
'group': '',
'line': 2081}]},
'valid': False}
'valid': False},
'extension_is.bad': {
'filename': 'extension_is.bad',
'filesize': 0,
'checker': 'python_ags4 v0.3.6',
'dictionary': '',
'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc),
'message': 'ERROR: extension_is.bad is not .ags format',
'errors': {},
'valid': False
},

}

# These response values break the schema
Expand Down
4 changes: 2 additions & 2 deletions test/fixtures_plain_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
PLAIN_TEXT_RESPONSES = {
'example1.ags': """
example1.ags: All checks passed!
'example_ags.ags': """
example_ags.ags: All checks passed!
# Metadata
Expand Down
Loading

0 comments on commit 40d9941

Please sign in to comment.