Skip to content

Commit

Permalink
plaintext/shellcmd: get rid of shell=True, cleanup grep commands
Browse files Browse the repository at this point in the history
  • Loading branch information
karlicoss committed Feb 16, 2021
1 parent 8ec3a24 commit 3a3c412
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 34 deletions.
1 change: 1 addition & 0 deletions src/promnesia/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

# TLDR: py37 on windows has an annoying bug.. https://github.com/karlicoss/promnesia/issues/91#issuecomment-701051074
def _fix(args: Paths) -> List[str]:
assert not isinstance(args, str), args # just to prevent shell=True calls...
return list(map(str, args))

import subprocess
Expand Down
61 changes: 37 additions & 24 deletions src/promnesia/sources/plaintext.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,57 @@
from ..common import get_logger, get_tmpdir, PathIsh, _is_windows

from pathlib import Path

from shlex import quote
from typing import List

# https://linux-and-mac-hacks.blogspot.co.uk/2013/04/use-grep-and-regular-expressions-to.html
_URL_REGEX = r'\b(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]'

# -n to output line numbers so we could restore context
# -I to ignore binaries
# TODO on findows use 'find'?
# FIXME path should be escaped..
_GREP_CMD = r"""grep --color=never -E -I {grep_args} --exclude-dir=".git" '{regex}' {path} || true"""

# TODO hmm.. this might result in errors because grep exits with 1 if no matches were found... sigh
def _grep(*, paths: List[str], recursive: bool) -> List[str]:
return [
'grep',
'--color=never',
*(['-r'] if recursive else []),
'-n', # print line numbers (to restore context)
'-I', # ignore binaries
'--exclude-dir=".git"',
'-E', # 'extended' syntax
_URL_REGEX,
*paths,
]

def _findstr(*, path: str, recursive: bool) -> List[str]:
return [
'findstr',
'/S',
'/P',
'/N',
'https*://',
path + (r'\*' if recursive else ''),
]

def _extract_from_dir(path: str) -> str:

def _extract_from_dir(path: str) -> List[str]:
if _is_windows:
return fr'''findstr /S /P /N "https*://" "{path}\*"'''
return _findstr(path=path, recursive=True)

return _GREP_CMD.format(
grep_args="-r -n",
regex=_URL_REGEX,
path=path, # TODO quote here too?
return _grep(
paths=[path],
recursive=True,
)

def _extract_from_file(path: str) -> str:
def _extract_from_file(path: str) -> List[str]:
if _is_windows:
# /P to skip non-printable
# /N to print line number
# /S to print filename
return f'''findstr /S /P /N "https*://" "{path}"'''
return _findstr(path=path, recursive=False)

return _GREP_CMD.format(
grep_args="-n",
regex=_URL_REGEX,
path=f"{quote(path)} /dev/null", # dev null to trick into displaying filename
return _grep(
paths=[path, '/dev/null'], # dev/null to trick into displaying filename
recursive=False,
)

# TODO eh, misleading..
def extract_from_path(path: PathIsh) -> str:

def extract_from_path(path: PathIsh) -> List[str]:
pp = Path(path)

tdir = get_tmpdir()
Expand Down
27 changes: 18 additions & 9 deletions src/promnesia/sources/shellcmd.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,29 @@
from datetime import datetime
import re
from ..compat import check_call, check_output
from typing import Optional
from urllib.parse import unquote
from ..compat import check_output, Paths
from typing import Optional, Union
import warnings

from ..common import Visit, Loc, Results, extract_urls, file_mtime, get_system_tz, now_tz, _is_windows
from ..common import Visit, Loc, Results, extract_urls, file_mtime, get_system_tz, now_tz


def index(command: str) -> Results:
def index(command: Union[str, Paths]) -> Results:
cmd: Paths
cmds: str
if isinstance(command, str):
cmds = command
warnings.warn("Passing string as a command is very fragile('{command}'). Please use list instead.")
cmd = command.split(' ')
else:
cmds = ' '.join(map(str, command))
cmd = command

tz = get_system_tz()

def handle_line(line: str) -> Results:
# grep dumps this as
# /path/to/file:lineno:rest
# note: on Windows, path contains :...
# note: on Windows, path contains : after the disk name..
m = re.search(r'(.*):(\d+):(.*)', line)
if m is None:
# todo warn maybe?
Expand All @@ -37,7 +47,7 @@ def handle_line(line: str) -> Results:
loc = Loc.file(fname, line=lineno)
else:
ts = now_tz()
loc = Loc.make(command)
loc = Loc.make(cmds)
for url in urls:
yield Visit(
url=url,
Expand All @@ -46,8 +56,7 @@ def handle_line(line: str) -> Results:
context=context,
)

# FIXME remove shell=True...
output = check_output(command, shell=True)
output = check_output(cmd)
lines = [line.decode('utf-8') for line in output.splitlines()]
for line in lines:
try:
Expand Down
3 changes: 2 additions & 1 deletion tests/indexer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ def test_custom() -> None:

visits = as_visits(W(
custom_gen.index,
"""grep -Eo -r --no-filename '(http|https)://\S+' """ + tdata('custom'),
# meh. maybe should deprecate plain string here...
"""grep -Eo -r --no-filename (http|https)://\S+ """ + tdata('custom'),
))
# TODO I guess filtering of equivalent urls should rather be tested on something having context (e.g. org mode)
assert len(visits) == 5
Expand Down

0 comments on commit 3a3c412

Please sign in to comment.