commit 1eb71e4 (0 parents)
Initial commit; scrubbed code (passwords etc)

Showing 50 changed files with 17,321 additions and 0 deletions.
@@ -0,0 +1,14 @@
clean_output.html
article_extractor/build/
log/
*.BROKEN.*
*.o
_crash_report.txt
dispatch/cache*
dispatch/output
copyright_removal/*.txt
copyright_removal/foo
/geo
/Scripts.txt
article_extractor/evaluation/
feed_mgmt/*
Binary file not shown.
Empty file.
@@ -0,0 +1,120 @@
#include <Python.h>
#include "base.h"
#include "htmlclean.h"
#include <stdio.h>

// Read the whole file at `path` into a TStr. Exits the process on error.
static TStr readFile(TStr path) {
    FILE *pFile;
    unsigned long lSize;
    char *buffer;
    size_t result;

    pFile = fopen(path.CStr(), "rb");
    if (pFile == NULL) { fputs("File error", stderr); exit(1); }

    // obtain file size:
    fseek(pFile, 0, SEEK_END);
    lSize = ftell(pFile);
    rewind(pFile);

    // allocate memory to contain the whole file, plus a terminating NUL
    // (the TStr constructor below expects a C string):
    buffer = (char*) malloc(lSize + 1);
    if (buffer == NULL) { fputs("Memory error", stderr); exit(2); }

    // copy the file into the buffer:
    result = fread(buffer, 1, lSize, pFile);
    if (result != lSize) { fputs("Reading error", stderr); exit(3); }
    buffer[lSize] = '\0';

    /* the whole file is now loaded in the memory buffer. */
    TStr ret(buffer);

    // terminate
    fclose(pFile);
    free(buffer);

    return ret;
}

// Standalone test driver: extract cleartext from sample.html and print it.
int main() {
    TStr html = readFile("sample.html");
    int a = -1, b = -1;
    THtmlClean *cleaner = new THtmlClean();
    TStr txt = cleaner->Extract(html);
    delete cleaner;
    printf(">>%s<< %d %d\n", txt.CStr(), a, b);
    return 0;
}

// Remove all HTML tags from `html`, keeping only the text between them.
// Quote-aware: a '>' inside a quoted attribute value does not close the tag.
TStr stripTags(TStr html) {
    char *ret = new char[html.Len() + 1]; int retLen = 0;
    char inQuotes = 0;  // 0, or the quote character we are currently inside
    bool inTag = false;
    for (int i = 0; i < html.Len(); i++) {
        char c = html[i];
        if (inTag) {
            if (c == '\'' || c == '"') {
                if (!inQuotes) inQuotes = c;
                else if (c == inQuotes) inQuotes = 0;
            }
            if (inQuotes == 0 && c == '>') { inTag = false; continue; }
        } else {
            if (c == '<') inTag = true;
        }

        if (!inTag) {
            ret[retLen++] = c;
        }
    }
    ret[retLen] = '\0';
    TStr retStr(ret);
    delete[] ret;
    return retStr;
}

extern "C" {

THtmlClean *cleaner;

static PyObject *
article_extractor_get_cleartext(PyObject *self, PyObject *args)
{
    const char *html_c;
    if (!PyArg_ParseTuple(args, "s", &html_c))
        return NULL;
    TStr html(html_c);
    /*
    html.ChangeStrAll("<p>", "\n<p>");
    html.ChangeStrAll("<p ", "\n<p ");
    html.ChangeStrAll("<br>", "\n<br>");
    html.ChangeStrAll("<br/>", "\n<br/>");
    html.ChangeStrAll("<br />", "\n<br />");
    */
    TStr txt = cleaner->Extract(html);
    //txt = stripTags(txt);
    if (txt.Len() > 100000) {
        // the cleaner probably got it wrong
        txt = "";
    }
    /*
    txt.ChangeStrAll("\r\n", "\n");
    txt.ChangeStrAll("\r", "\n");
    while (txt.ChangeStrAll("\n ", "\n") > 0);
    while (txt.ChangeStrAll("\n\n", "\n") > 0);
    */
    return Py_BuildValue("s", txt.CStr());
}

static PyMethodDef PyMethods[] = {
    {"get_cleartext", article_extractor_get_cleartext, METH_VARARGS,
     "Given HTML of a news article, return cleartext of the article body."},
    {NULL, NULL, 0, NULL} /* Sentinel */
};

PyMODINIT_FUNC
initarticle_extractor(void)
{
    cleaner = new THtmlClean();
    (void) Py_InitModule("article_extractor", PyMethods);
}

} // extern "C"
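For reference, a minimal usage sketch of the extension this file defines, assuming it has been compiled to article_extractor.so on the import path (Python 2, matching the Py_InitModule API used above); sample.html is an assumed input file, not part of this commit:

# Hypothetical usage of the compiled module; not part of this commit.
import article_extractor

html = open('sample.html', 'rb').read()  # assumed utf8 html file
# get_cleartext() takes utf8 html and returns the article body as a utf8
# string, or an empty string when extraction (probably) went wrong.
txt = article_extractor.get_cleartext(html)
print txt.decode('utf8')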
@@ -0,0 +1,176 @@
import re
import sys
sys.path.append('.')   # for the compiled article_extractor extension
import article_extractor
sys.path.append('..')  # for util
import util
import struct
import hashlib
import lxml.html, lxml.etree

htmlWhitespace = re.compile(r'(<br ?/? ?>|<p[ >])')
htmlTags = re.compile(r'<\s*/?\s*(\w+)[^>]*>')
htmlComments = re.compile(r'<!--.*?-->', re.DOTALL)
txtWhitespace = re.compile(r'[ \t]+')
multipleNewline = re.compile(r'(\n\s*)+')

def _load_copyright_ngrams(path):
    """
    Return a set of n-gram hashes read from `path`, which should contain lines
    with two space-separated numbers: n and the n-gram hash.
    Such files are produced by `dump_common_maximal()` in find_freq_ngrams.cpp.
    """
    try:
        with open(path) as f:
            return set(int(line.split()[1]) for line in f)
    except Exception, e:
        print 'Warning: failed to load copyright-ngrams data from %r.' % path
        print 'Reason:', e
        return set()
stop_ngrams = _load_copyright_ngrams("../copyright_removal/freq_ngrams.txt")


def md5_64(txt):
    "Lower 64 bits of the md5 digest, cast as a uint64."
    return struct.unpack("<Q", hashlib.md5(txt).digest()[8:])[0]

def remove_copyright(txt, n, stop_ngrams=stop_ngrams):
    """
    Takes a cleartext version of a document (unicode) and removes all `n`-grams whose
    `md5_64()` hashes are contained in `stop_ngrams`.
    Detected n-grams can overlap.
    Takes unicode, returns unicode.
    """
    s = txt.encode('utf8')+' '  # C++ computes hashes on utf8-encoded strings.
    a = b = 0  # Like in find_freq_ngrams.cpp, s[a:b] is the current n-gram
    for i in range(n):
        b = s.find(' ', b+1)
        if b == -1: break
    if b == -1: return txt  # no n-grams, hence no changes

    kill_ranges = []  # (start,end) character index spans that need to be removed
    while b != -1:
        #print md5_64(s[a:b]), `s[a:b]`
        # Check if the current n-gram needs to be removed; merge its span
        # with the previous kill range if the two overlap.
        if md5_64(s[a:b]) in stop_ngrams:
            if kill_ranges and kill_ranges[-1][1] >= a: kill_ranges[-1][1] = b+1
            else: kill_ranges.append([a, b+1])
        # advance to the next n-gram (shift both boundaries by one word)
        a = s.find(' ', a)+1
        b = s.find(' ', b+1)

    if not kill_ranges: return txt  # no changes

    # keep everything before, between, and after the kill ranges
    slices = [slice(0, kill_ranges[0][0])]
    for i in range(len(kill_ranges)-1):
        slices.append(slice(kill_ranges[i][1], kill_ranges[i+1][0]))
    slices.append(slice(kill_ranges[-1][1], -1))  # -1 to remove the trailing space character
    s = ''.join(s[slc] for slc in slices)

    return s.decode('utf8', 'replace')

def get_cleartext(html, logger=None):
    """
    Converts a full-page html (utf8) to the cleartext (unicode) containing just the article body.
    The first line of the return value is the title (can be empty). If there was an
    error or if the html is suspected not to contain an article, an empty string is returned.
    `logger` should be None or a logging.Logger instance.
    `html` is usually text (unicode or utf8) but can also be an lxml tree; in that case, some
    heuristic cleanup is performed first.
    This calls the glib html->cleartext function, then does a bit of cleanup
    and error checking.
    """

    if type(html) == lxml.html.HtmlElement:
        # time for heuristic cleanup
        xDoc = html
        if xDoc is None: return ''
        for el in xDoc.findall('.//*'):
            info = (el.get('id','')+':'+el.get('class','')).lower()
            # If the element is suspicious, replace it with a "barrier" (a bunch of <img>
            # and <a> tags) that the C module is very unlikely to include in the result.
            if re.search('foot|header|^nav|naviga[ct]|[ck]omm?ent|dis[kc]us|user|notice|spe[cz]ial|about', info) \
                    and not re.search('main|article|content', info) and el.getparent() is not None:
                idx = el.getparent().index(el)
                el.getparent()[idx+1:idx+1] = [lxml.etree.fromstring('<a href="blah.com"><img src="http://shite.com" /></a>') for i in range(20)]
                el.drop_tree()
        html = lxml.etree.tostring(xDoc, encoding='utf8')

    # If the input is very non-html-looking, don't bother with C++, it will only crash
    if '\000' in html:
        return ''

    # Do the decoding, but watch out for weirdness in return values
    txt = article_extractor.get_cleartext(html)
    try:
        txt = txt.decode('utf8')
    except UnicodeDecodeError:
        if logger:
            logger.exception('Article was cleartexted, but cleartext was not in utf8. Saved cleartext to /tmp/non_utf8. Exception:')
        try:
            with open('/tmp/non_utf8', 'w') as f: f.write(txt)
        except:
            pass
        txt = ''

    if len(txt) < 200:
        # This can't be good/real
        return ''

    # Fix up the output. Step 1: remove HTML tags.
    # TODO: Need to strip tags from titles as well (rss crawler, gnews crawler).
    #       Move some of the code below to util.py, reuse.
    global txtr; txtr = txt  # for debug
    # Step 1a: small normalizations
    txt = txt.rstrip('<')  # glib output glitch; this is present only sometimes
    txt = txt.replace('\r\n','\n').replace('\r','\n')
    txt = htmlComments.sub('', txt)
    txt = htmlWhitespace.sub(' \n\\1', txt)
    # Step 1b: strip html tags (not elements!) except <script> and <style>
    txt = htmlTags.sub(lambda m: m.group(0) if m.group(1).lower() in ('script','style') else '', txt)
    # Step 1c: if any tags remain, they are bad. Remove them with lxml (cheap at this point).
    if htmlTags.search(txt):
        xRoot = lxml.html.fromstring(txt)
        for c in xRoot:
            xRoot.remove(c)
        txt = xRoot.text_content()

    # Step 2: decode HTML entities, normalize punctuation (e.g. weird quotes)
    txt = util.normalizePunctuation(util.htmlUnescape(txt))

    # Step 3: normalize whitespace
    txt = multipleNewline.sub('\n', txt)
    txt = txtWhitespace.sub(' ', txt)

    # Step 4: add an empty title (old articles in the db have the first row reserved for the title)
    txt = '\n'+txt
    # Step 5: remove copyrights and similar boilerplate
    txt = remove_copyright(txt, 9)

    if type(txt) != unicode:
        print repr(txt)  # debug; this should not normally happen
        txt = txt.decode('utf8')
    return txt

if __name__ == '__main__':
    # Self-test on a Slovenian nursery rhyme: the three listed trigrams get removed.
    print repr(remove_copyright(
        'majhna sem bila, piske sem pasla piske so civkale jaz sem pa rasla',
        3,
        set(map(md5_64, ['sem bila, piske', 'piske sem pasla', 'jaz sem pa'])),
    ))

    import psycopg2, psycopg2.extras
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(database='news', host='maximus', user='mitjat', password='XXX_GITHUB_XXX')
    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    cur.execute("SET bytea_output TO 'escape'")

    cur.execute("SELECT content FROM article WHERE feed_articleid=15008406 --29991787; --39606438; --3256641")
    html = str(cur.fetchone()['content'])
    txt = get_cleartext(html)
    print '%d bytes html -> %d bytes text' % (len(html), len(txt))
    print txt.encode('utf8')
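The imports above expect a compiled article_extractor module next to this script. A minimal build sketch, assuming a distutils setup.py; the source file name and the glib include/library paths are guesses, not part of this commit:

# setup.py -- build sketch (assumed; file and library names are hypothetical)
from distutils.core import setup, Extension

setup(
    name='article_extractor',
    ext_modules=[Extension(
        'article_extractor',
        sources=['article_extractor.cpp'],  # hypothetical source file name
        include_dirs=['../glib'],           # where base.h / htmlclean.h might live (assumed)
        extra_objects=['../glib/glib.a'],   # prebuilt glib static library (assumed)
    )],
)

Built with "python setup.py build_ext --inplace", this would drop article_extractor.so into the current directory, which is what the sys.path.append('.') at the top of this file relies on.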