Adds python-indexer #3317

Open · wants to merge 7 commits into base: 3.1.x
191 changes: 191 additions & 0 deletions sapl-logs/python-indexer.py
@@ -0,0 +1,191 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import logging
import sys
import requests
import json
import time
import re

# TODO: add a timestamp to python-indexer.py's log output

USE_SOLR = os.getenv('USE_SOLR', 'True') == 'True'  # TODO: change the default to False in production
SOLR_BASE_URL = os.getenv('SOLR_URL', 'http://localhost:8983') + '/solr'

SOLR_UPDATE_URL = f'{SOLR_BASE_URL}/sapl-logs/update?commitWithin=1000'

SOLR_COLLECTION_STATUS = (
    f'{SOLR_BASE_URL}/sapl-logs/admin/ping?distrib=true&wt=json'
)

BATCH_SIZE = 10 # https://lucidworks.com/post/really-batch-updates-solr-2/
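# Batching several documents per POST amortizes HTTP and indexing
# overhead (the point of the post linked above); 10 is a conservative size.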

previous = None

buffer = []
payload = []

num_docs = 0
total_docs = 0

# logging setup
logfilename = 'python-indexer.log'

logging.basicConfig(
    filename=logfilename,
    filemode='w+',
    level=logging.INFO
)

logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
logger = logging.getLogger('python-indexer')
logger.setLevel(logging.DEBUG)

print(f"The logging of this program is done at {logfilename}")


def push_to_solr():
    logger.debug(f"Sending {len(payload)} documents to Solr")

    r = requests.post(
        SOLR_UPDATE_URL,
        data=json.dumps(payload),
        headers={'Content-Type': 'application/json; charset=utf-8'}
    )
    logger.debug(r.content)
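# Sketch of the request produced above (shape of Solr's JSON update API;
# the document values are illustrative):
#
#   POST {SOLR_BASE_URL}/sapl-logs/update?commitWithin=1000
#   [{"level": "ERROR", "datetime": "2021-06-01T14:22:33Z", ...}, ...]
#
# commitWithin=1000 asks Solr to commit (make the documents searchable)
# within roughly one second of receiving them.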


def parse_fields(groups):
    from datetime import datetime as dt

    iso8601 = "{} {}".format(groups[1], groups[2].replace(",", "."))
    d = dt.fromisoformat(iso8601)
    datetime = d.strftime('%Y-%m-%dT%H:%M:%SZ')

    # datetime = groups[1] + "T" + groups[2].split(',')[0] + "Z"

    fields = {
        'level': groups[0],
        'datetime': datetime
    }

    parts = groups[3].split()
    fields['server'] = parts[0]
    fields['path'] = parts[1]

    # format: sapl.painel.views:get_votos:497
    function = parts[2].split(':')
    fields['app_file'] = function[0]
    fields['method'] = function[1]
    fields['line_number'] = function[2]
    fields['function'] = parts[2]

    fields['message'] = ' '.join(parts[3:])

    return fields
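# Worked example (hypothetical log line, in the format matched by the
# regex in parse_logs below):
#
#   line:   ERROR 2021-06-01 14:22:33,123 prodsapl /painel/ sapl.painel.views:get_votos:497 vote not found
#   fields: {'level': 'ERROR', 'datetime': '2021-06-01T14:22:33Z',
#            'server': 'prodsapl', 'path': '/painel/',
#            'app_file': 'sapl.painel.views', 'method': 'get_votos',
#            'line_number': '497',
#            'function': 'sapl.painel.views:get_votos:497',
#            'message': 'vote not found'}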


def parse_logs(line):
    global previous

    # discard empty lines
    if not line.strip():
        return

    pattern = (
        "^(ERROR|INFO|DEBUG|WARNING)" +
        r'\s+(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}:\d{2},\d+)\s+(.*)'
    )
    match = re.match(pattern, line)

    if match:
        groups = match.groups()
        fields = parse_fields(groups)
        fields['line'] = line

        # a non-empty buffer here means a stack trace followed the previous entry
        if buffer and previous:
            previous['stacktrace'] = ''.join(buffer)
            buffer.clear()
        elif not previous:
            buffer.clear()  # discard orphaned lines that belong to no entry

        # append the previous entry, now known to be complete
        if previous:
            payload.append(previous)

        # delay appending the current entry (a stack trace may follow it)
        previous = fields
    else:
        # not the start of a new entry: collect the line into the buffer
        buffer.append(line)

    logger.debug(len(payload))


def follow(fd):
    """ generator function that yields new lines in a file """

    # seek to the end of the file
    fd.seek(0, os.SEEK_END)

    # start infinite loop
    while True:
        # read the last line of the file
        line = fd.readline()
        # sleep if the file hasn't been updated
        if not line:
            time.sleep(0.1)
            continue

        yield line
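# follow() behaves like `tail -f`: it starts at the current end of the
# file and polls every 0.1s for new lines, so iterating over it never
# terminates on its own.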


def check_solr():
    try:
        r = requests.get(SOLR_BASE_URL)
        if r.status_code == 200:
            print(f"Solr server at {SOLR_BASE_URL} is up and running...")

            print("Checking collection health...")

            r = requests.get(SOLR_COLLECTION_STATUS)
            data = r.json()
            if data['status'] == 'OK':
print("Collection sapl-logs is healthy")

    except Exception as e:
        logger.error(
            "Exception: " + str(e) +
            f"\nError connecting to Solr at {SOLR_COLLECTION_STATUS}"
        )
        sys.exit(1)
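# A healthy ping response looks roughly like this (abridged; the exact
# shape depends on the Solr version):
#
#   {"responseHeader": {"status": 0, ...}, "status": "OK"}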


if __name__ == '__main__':

    if not USE_SOLR:
        print(f"USE_SOLR={USE_SOLR}")
        sys.exit(0)

    check_solr()

    filename = sys.argv[1]
    print(f"Opening log file {filename}...")
    logfile = open(filename, 'r')
    loglines = follow(logfile)

    # iterate over the generator
    for line in loglines:
        logger.debug(f"Current payload size: {len(payload)}")
        parse_logs(line)

        num_docs = (num_docs + 1) % BATCH_SIZE
        if num_docs == 0 and payload:
            push_to_solr()
            total_docs += len(payload)
            payload.clear()

    # final flush; in practice unreachable, since follow() never returns
    push_to_solr()
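A minimal smoke test for the pipeline, assuming a local Solr with the sapl-logs core already created: run `python python-indexer.py <path-to-sapl-log>`, let a few entries come in, then query them back. The sketch below uses Solr's standard select handler; the field names match the ones built in parse_fields, but the query values are illustrative, not part of this PR:

import requests

SOLR_BASE_URL = 'http://localhost:8983/solr'

# fetch the five most recent ERROR entries indexed by python-indexer.py
r = requests.get(
    f'{SOLR_BASE_URL}/sapl-logs/select',
    params={'q': 'level:ERROR', 'sort': 'datetime desc', 'rows': 5}
)

data = r.json()
print(f"{data['response']['numFound']} ERROR documents indexed")
for doc in data['response']['docs']:
    print(doc.get('datetime'), doc.get('message'))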