-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.py
76 lines (67 loc) · 2.05 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from elasticsearch import Elasticsearch
import sys
import ipdb
import os
from pprint import pprint
from git import Repo
import re
# --- Command-line configuration (positional arguments) ----------------------
# argv[1]: Elasticsearch index every document is written to.
INDEX_NAME = sys.argv[1]
# argv[2]: doc_type used for the per-file documents.
TYPE_NAME = sys.argv[2]
# argv[3]: doc_type used for the per-function documents.
TYPE_NAME_FUNC = sys.argv[3]
# argv[4]: address handed to the Elasticsearch client below.
ELASTICSEARCH_URL = sys.argv[4]
# argv[5]: repository identifier appended to "https://github.com/".
REPO = sys.argv[5]
# argv[6]: parent directory under which the repository is cloned.
PRIVATE_PATH = sys.argv[6]

# Number of leading characters of a file stored as its "body_preview".
PREVIEW_SIZE = 2000
# Maximum number of characters of a file read and stored as its "body".
FULL_SIZE = 10000

# Shared client used by get_paths() for all indexing calls.
es = Elasticsearch([ELASTICSEARCH_URL])
def get_paths(path, repo_url):
    """Walk a directory tree and index its files and functions.

    For every non-hidden file below ``path`` the following documents are
    written to the ``INDEX_NAME`` index through the module-level ``es`` client:

    * one ``TYPE_NAME_FUNC`` document for each line on which
      ``find_function`` reports a name, holding the file path, the function
      name and the 1-based line number;
    * one ``TYPE_NAME`` document for the file itself, holding the first
      ``FULL_SIZE`` characters as ``body`` and the first ``PREVIEW_SIZE``
      characters as ``body_preview``.

    Files whose name starts with "." and files located in (or below) a
    directory whose name starts with "." are skipped.

    Args:
        path: Root directory to walk.
        repo_url: Value stored in the ``repo_url`` field of every document.
    """
    for full_root, _dirs, files in os.walk(path, topdown=False):
        # TODO: implement ignoring many files here

        # os.walk() always yields roots that start with `path`, so strip the
        # prefix positionally; splitting on `path` would cut the result short
        # if the same text appeared again deeper in the tree.
        root = full_root[len(path):]

        # The stripped root normally starts with a path separator, so looking
        # only at its first character never detects a hidden directory.
        # Check every component instead (this also covers nested ones).
        if any(part.startswith(".") for part in root.split(os.sep)):
            continue

        for name in files:
            if name.startswith("."):
                continue

            path_name = os.path.join(root, name)
            full_path_name = os.path.join(full_root, name)

            # NOTE(review): the file is opened in text mode; non-text files
            # may fail to decode depending on the interpreter -- confirm.
            with open(full_path_name, 'r') as f:
                read_data = f.read(FULL_SIZE)
                # Rewind so the line scan below starts at the first line.
                f.seek(0)
                for line_number, line in enumerate(f, 1):
                    function_name = find_function(line)
                    if function_name:
                        function_doc = {
                            "path": path_name,
                            "function_name": function_name,
                            "line_number": line_number,
                            "repo_url": repo_url,
                        }
                        es.index(index=INDEX_NAME, doc_type=TYPE_NAME_FUNC,
                                 body=function_doc)

            file_doc = {
                "path": path_name,
                "name": name,
                "body": read_data,
                "body_preview": read_data[:PREVIEW_SIZE],
                "repo_url": repo_url,
            }
            es.index(index=INDEX_NAME, doc_type=TYPE_NAME, body=file_doc)
# def read_in_chunks(file_object, chunk_size=1024):
# """Lazy function (generator) to read a file piece by piece.
# Default chunk size: 1k."""
# while True:
# data = file_object.read(chunk_size)
# if not data:
# break
# yield data
# Matches the keyword "def" as a standalone word -- at the very start of the
# line or after whitespace -- and captures the identifier that follows, up to
# the first whitespace character or opening parenthesis.
_DEF_PATTERN = re.compile(r"(?:^|\s)def\s+([^\s(]+)")


def find_function(line):
    """Return the name of the function defined on ``line``, if any.

    The line is searched for the word ``def`` followed by a name. Unlike a
    pattern that insists on whitespace before ``def`` and after the
    signature, this also recognises definitions that start in the first
    column and definitions on a final line without a trailing newline.

    Args:
        line: A single line of source text.

    Returns:
        The text between ``def`` and the first "(" or whitespace character,
        or ``None`` when the line contains no definition.
    """
    match = _DEF_PATTERN.search(line)
    if match is None:
        return None
    return match.group(1)
def clone_repo(repo_dir):
    """Clone the GitHub repository named by ``REPO`` into ``repo_dir``.

    The remote URL is built by appending the module-level ``REPO`` value to
    "https://github.com/".

    Args:
        repo_dir: Local directory the repository is cloned into.
    """
    repo_url = "https://github.com/" + REPO
    # Repo(<path>) opens an *existing local* repository, so it cannot be
    # given a remote URL; Repo.clone_from() is the call that clones a URL.
    Repo.clone_from(repo_url, repo_dir)
def main():
    """Clone the configured repository below ``PRIVATE_PATH``.

    The target directory is ``PRIVATE_PATH`` joined by "/" with the first
    "/"-separated segment of ``REPO``.
    """
    first_segment = REPO.split("/")[0]
    repo_dir = "/".join((PRIVATE_PATH, first_segment))
    clone_repo(repo_dir)
    # Indexing of the cloned tree is currently disabled:
    # get_paths(repo_dir, REPO)
# Run the indexer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()