Commit 1eb71e4

Initial commit; scrubbed code (passwords etc)

mitjat committed Nov 12, 2012 · 0 parents
Showing 50 changed files with 17,321 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .gitignore
@@ -0,0 +1,14 @@
clean_output.html
article_extractor/build/
log/
*.BROKEN.*
*.o
_crash_report.txt
dispatch/cache*
dispatch/output
copyright_removal/*.txt
copyright_removal/foo
/geo
/Scripts.txt
article_extractor/evaluation/
feed_mgmt/*
Binary file added article_extractor/HtmlCleanTagProb.dat
Binary file not shown.
Empty file added article_extractor/__init__.py
Empty file.
120 changes: 120 additions & 0 deletions article_extractor/article_extractor.cpp
@@ -0,0 +1,120 @@
#include <Python.h>
#include "base.h"
#include "htmlclean.h"
#include <stdio.h>

static TStr readFile(TStr path) {
    FILE *pFile;
    unsigned long lSize;
    char *buffer;
    size_t result;

    pFile = fopen(path.CStr(), "rb");
    if (pFile == NULL) { fputs("File error", stderr); exit(1); }

    // obtain file size:
    fseek(pFile, 0, SEEK_END);
    lSize = ftell(pFile);
    rewind(pFile);

    // allocate memory to contain the whole file, plus a terminating NUL:
    buffer = (char*) malloc(lSize + 1);
    if (buffer == NULL) { fputs("Memory error", stderr); exit(2); }

    // copy the file into the buffer and NUL-terminate it (TStr expects a C string):
    result = fread(buffer, 1, lSize, pFile);
    if (result != lSize) { fputs("Reading error", stderr); exit(3); }
    buffer[lSize] = '\0';

    /* the whole file is now loaded in the memory buffer. */
    TStr ret(buffer);

    fclose(pFile);
    free(buffer);

    return ret;
}

// Quick manual test: clean a local sample file and print the result.
int main() {
    TStr html = readFile("sample.html");
    THtmlClean *cleaner = new THtmlClean();
    TStr txt = cleaner->Extract(html);
    delete cleaner;
    printf(">>%s<<\n", txt.CStr());
}

// Remove all HTML tags from `html`, keeping only the text between them.
// Quote-aware: a '>' inside a quoted attribute value does not end the tag.
TStr stripTags(TStr html) {
    char *ret = new char[html.Len()+1]; int retLen = 0;
    char inQuotes = 0;   // 0 when outside quotes, else the active quote character
    bool inTag = false;
    for (int i = 0; i < html.Len(); i++) {
        char c = html[i];
        if (inTag) {
            if (c=='\'' || c=='"') {
                if (!inQuotes) inQuotes = c;
                else if (c==inQuotes) inQuotes = 0;
            }
            if (inQuotes==0 && c=='>') { inTag = false; continue; }
        } else {
            if (c=='<') inTag = true;
        }

        if (!inTag) {
            ret[retLen++] = c;
        }
    }
    ret[retLen] = '\0';
    TStr retStr(ret);
    delete[] ret;
    return retStr;
}
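
// Illustrative behavior (example input/output, not from the original source):
//   stripTags("<p class=\"x\">Hi <b>there</b></p>")  -->  "Hi there"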

extern "C" {

THtmlClean *cleaner;

static PyObject *
article_extractor_get_cleartext(PyObject *self, PyObject *args)
{
    const char *html_c;
    if (!PyArg_ParseTuple(args, "s", &html_c))
        return NULL;
    TStr html(html_c);
    /*
    html.ChangeStrAll("<p>","\n<p>");
    html.ChangeStrAll("<p ","\n<p ");
    html.ChangeStrAll("<br>","\n<br>");
    html.ChangeStrAll("<br/>","\n<br/>");
    html.ChangeStrAll("<br />","\n<br />");
    */
    TStr txt = cleaner->Extract(html);
    //txt = stripTags(txt);
    if (txt.Len() > 100000) {
        // the cleaner probably got it wrong
        txt = "";
    }
    /*
    txt.ChangeStrAll("\r\n","\n");
    txt.ChangeStrAll("\r","\n");
    while (txt.ChangeStrAll("\n ","\n")>0);
    while (txt.ChangeStrAll("\n\n","\n")>0);
    */
    return Py_BuildValue("s", txt.CStr());
}

static PyMethodDef PyMethods[] = {
    {"get_cleartext", article_extractor_get_cleartext, METH_VARARGS,
     "Given HTML of a news article, return cleartext of the article body."},
    {NULL, NULL, 0, NULL}   /* Sentinel */
};

PyMODINIT_FUNC
initarticle_extractor(void)
{
    cleaner = new THtmlClean();
    (void) Py_InitModule("article_extractor", PyMethods);
}

}
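
For orientation, a minimal sketch of how the compiled extension is meant to be called from Python; the build step itself is assumed, and `page.html` is a hypothetical input file:

    import article_extractor

    html = open('page.html').read()               # raw utf8 bytes of a news page
    txt = article_extractor.get_cleartext(html)   # utf8 cleartext; '' when extraction is suspect
    print txt.decode('utf8')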
176 changes: 176 additions & 0 deletions article_extractor/article_extractor2.py
@@ -0,0 +1,176 @@
import re
import sys; sys.path.append('.'); sys.path.append('..')
import article_extractor
import util
import struct
import hashlib
import lxml.html, lxml.etree

htmlWhitespace = re.compile(r'(<br ?/? ?>|<p[ >])')
htmlTags = re.compile(r'<\s*/?\s*(\w+)[^>]*>')
htmlComments = re.compile(r'<!--.*?-->', re.DOTALL)
txtWhitespace = re.compile(r'[ \t]+')
multipleNewline = re.compile(r'(\n\s*)+')

def _load_copyright_ngrams(path):
    """
    Return a set of n-gram hashes read from `path`, which should contain lines
    with two space-separated numbers: n and the n-gram hash.
    Such files are produced by `dump_common_maximal()` in find_freq_ngrams.cpp.
    """
    try:
        with open(path) as f:
            return set(int(line.split()[1]) for line in f)
    except Exception, e:
        print 'Warning: failed to load copyright-ngrams data from %r.' % path
        print 'Reason:', e
        return set()

stop_ngrams = _load_copyright_ngrams("../copyright_removal/freq_ngrams.txt")
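# For illustration only (the hash values below are made up, not real md5_64 outputs),
# each line of freq_ngrams.txt pairs the n-gram length with the n-gram's hash:
#   9 1234567890123456789
#   9 9876543210987654321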



def md5_64(txt):
    "Last 8 bytes of the md5 digest, interpreted as a little-endian uint64."
    return struct.unpack("<Q", hashlib.md5(txt).digest()[8:])[0]

def remove_copyright(txt, n, stop_ngrams=stop_ngrams):
    """
    Takes a cleartext version of a document (unicode) and removes all `n`-grams
    whose `md5_64()` hashes are contained in `stop_ngrams`.
    Detected n-grams can overlap.
    Takes unicode, returns unicode.
    """
    s = txt.encode('utf8') + ' '  # C++ computes hashes on utf8-encoded strings.
    a = b = 0  # Like in find_freq_ngrams.cpp, s[a:b] is the current n-gram
    for i in range(n):
        b = s.find(' ', b+1)
        if b == -1: break
    if b == -1: return txt  # no complete n-gram, hence no changes

    kill_ranges = []  # (start, end) character index spans that need to be removed
    while b != -1:
        # check if the current n-gram needs to be removed
        if md5_64(s[a:b]) in stop_ngrams:
            # extend the previous kill range if it overlaps this n-gram, else start a new one
            if kill_ranges and kill_ranges[-1][1] >= a: kill_ranges[-1][1] = b+1
            else: kill_ranges.append([a, b+1])
        # advance to the next n-gram
        a = s.find(' ', a) + 1
        b = s.find(' ', b+1)

    if not kill_ranges: return txt  # no changes

    # keep only the spans between (and around) the kill ranges
    slices = [slice(0, kill_ranges[0][0])]
    for i in range(len(kill_ranges)-1):
        slices.append(slice(kill_ranges[i][1], kill_ranges[i+1][0]))
    slices.append(slice(kill_ranges[-1][1], -1))  # -1 to drop the trailing space appended above
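    # e.g. with kill_ranges == [[5,12], [20,30]] (made-up offsets), the surviving
    # pieces are s[0:5], s[12:20] and s[30:-1]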
    s = ''.join(s[slc] for slc in slices)

    return s.decode('utf8', 'replace')

def get_cleartext(html, logger=None):
    """
    Converts a full-page html (utf8) to the cleartext (unicode) containing just the article body.
    The first line of the return value is the title (can be empty). If there was an
    error, or if the html is suspected not to contain an article, an empty string is returned.
    `logger` should be None or a logging.Logger instance.
    `html` is usually text (unicode or utf8) but can also be an lxml tree; in that case, some
    heuristic cleanup is performed first.
    This calls the glib html->cleartext function, then does a bit of cleanup
    and error checking.
    """

    if type(html) == lxml.html.HtmlElement:
        # time for heuristic cleanup
        xDoc = html
        if xDoc is None: return ''
        for el in xDoc.findall('.//*'):
            info = (el.get('id','') + ':' + el.get('class','')).lower()
            # if the element is suspicious, replace it with a "barrier" (a bunch of <img> and <a>
            # tags) that the C module is very unlikely to include in the result
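            # e.g. (illustrative) id="footer" or class="comments" would match below and be
            # dropped, while class="article-content" is spared by the 'main|article|content' test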
            if re.search('foot|header|^nav|naviga[ct]|[ck]omm?ent|dis[kc]us|user|notice|spe[cz]ial|about', info) \
                    and not re.search('main|article|content', info) and el.getparent() is not None:
                idx = el.getparent().index(el)
                el.getparent()[idx+1:idx+1] = [lxml.etree.fromstring('<a href="blah.com"><img src="http://shite.com" /></a>') for i in range(20)]
                el.drop_tree()
        html = lxml.etree.tostring(xDoc, encoding='utf8')


    # If the input is very non-html-looking, don't bother with C++, it would only crash
    if '\000' in html:
        return ''

    # Do the decoding, but watch out for weirdness in return values
    txt = article_extractor.get_cleartext(html)
    try:
        txt = txt.decode('utf8')
    except UnicodeDecodeError:
        if logger:
            logger.exception('Article %s was cleartexted, but cleartext was not in utf8. Saved cleartext to /tmp/non_utf8. Exception:')
        try:
            with open('/tmp/non_utf8','w') as f: f.write(txt)
        except:
            pass
        txt = ''

    if len(txt) < 200:
        # too short to be a good/real article
        return ''

    # Fix up the output. Step 1: remove HTML tags.
    # TODO: Need to strip tags from titles as well (rss crawler, gnews crawler).
    #       Move some of the code below to util.py, reuse.
    global txtr; txtr = txt  # keep the raw cleaner output around for debugging
    # Step 1a: small normalizations
    txt = txt.rstrip('<')  # glib output glitch; this is present only sometimes
    txt = txt.replace('\r\n','\n').replace('\r','\n')
    txt = htmlComments.sub('', txt)
    txt = htmlWhitespace.sub(' \n\\1', txt)
    # Step 1b: strip html tags (not elements!) except <script> and <style>
    txt = htmlTags.sub(lambda m: m.group(0) if m.group(1).lower() in ('script','style') else '', txt)
    # Step 1c: if any tags remain, they are bad. Remove them with lxml (cheap at this point).
    if htmlTags.search(txt):
        xRoot = lxml.html.fromstring(txt)
        for c in xRoot:
            xRoot.remove(c)
        txt = xRoot.text_content()

    # Step 2: decode HTML entities, normalize punctuation (e.g. weird quotes)
    txt = util.normalizePunctuation(util.htmlUnescape(txt))

    # Step 3: normalize whitespace
    txt = multipleNewline.sub('\n', txt)
    txt = txtWhitespace.sub(' ', txt)

    # Step 4: add an empty title line (old articles in the db have the first row reserved for the title)
    txt = '\n' + txt
    # Step 5: remove copyrights and similar boilerplate
    txt = remove_copyright(txt, 9)

    if type(txt) != unicode:
        print repr(txtr)  # debug
        txt = txt.decode('utf8')
    return txt

if __name__ == '__main__':
    # quick self-test; the sentence and its 3-grams are Slovenian folk-song lyrics
    print repr(remove_copyright(
        'majhna sem bila, piske sem pasla piske so civkale jaz sem pa rasla',
        3,
        set(map(md5_64, ['sem bila, piske', 'piske sem pasla', 'jaz sem pa'])),
    ))

    import psycopg2, psycopg2.extras
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(database='news', host='maximus', user='mitjat', password='XXX_GITHUB_XXX')
    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    cur.execute("SET bytea_output TO 'escape'")

    cur.execute("SELECT content FROM article WHERE feed_articleid=15008406 --29991787; --39606438; --3256641")
    html = str(cur.fetchone()['content'])
    txt = get_cleartext(html)
    print '%d bytes html -> %d bytes text' % (len(html), len(txt))
    print txt.encode('utf8')