-
Notifications
You must be signed in to change notification settings - Fork 1
/
detect.py
46 lines (38 loc) · 1.54 KB
/
detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import re
import spacy
from furl import furl
class DetectionHandler:
    """Detect page types, query mentions, social platforms and named entities.

    The handler holds two pieces of state: ``social_patterns``, a mapping of
    platform name to regular-expression pattern loaded from a JSON file, and
    ``nlp``, a spaCy pipeline used to extract entities from page text.
    """

    def __init__(self, social_platforms_file):
        """Load the social-platform patterns and the spaCy pipeline.

        Args:
            social_platforms_file: Path to a JSON file whose top-level object
                maps a platform name to a regular-expression pattern.
        """
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(social_platforms_file, 'r', encoding='utf-8') as file:
            self.social_patterns = json.load(file)
        self.nlp = spacy.load("en_core_web_sm")

    def enhanced_detection(self, page_content, query, types_keywords):
        """Analyze page text for type keywords, query mentions and entities.

        Args:
            page_content: Text of the page to analyze.
            query: Phrase whose case-insensitive occurrences are counted in
                ``page_content``. An empty or ``None`` query counts as zero
                mentions.
            types_keywords: Mapping of type name to an iterable of keywords.
                A type is flagged when any of its keywords occurs
                (case-insensitively) in ``page_content``.

        Returns:
            A dict with the keys:
              * ``is_forum`` / ``is_news``: flags for the ``"forum"`` and
                ``"news"`` entries of ``types_keywords`` (``False`` when the
                entry is absent).
              * ``query_mentions``: number of non-overlapping occurrences of
                ``query`` in the page text, ignoring case.
              * ``social_platforms_detected``: names of the platforms whose
                pattern matches ``page_content`` (the regex search itself is
                case-sensitive unless the pattern says otherwise).
              * ``entities``: mapping of entity text to entity label; when the
                same text occurs more than once the last label wins.
        """
        # Analyze page content with spaCy
        doc = self.nlp(page_content)

        # Lower-case the page once instead of once per keyword.
        lowered_content = page_content.lower()

        detection_result = {
            type_key: any(
                keyword.lower() in lowered_content for keyword in keywords
            )
            for type_key, keywords in types_keywords.items()
        }

        # str.count("") returns len(text) + 1, which is a meaningless mention
        # count, so an empty query is reported as zero mentions.
        query_mentions = lowered_content.count(query.lower()) if query else 0

        social_platforms_detected = [
            platform
            for platform, pattern in self.social_patterns.items()
            if re.search(pattern, page_content)
        ]

        return {
            "is_forum": detection_result.get("forum", False),
            "is_news": detection_result.get("news", False),
            "query_mentions": query_mentions,
            "social_platforms_detected": social_platforms_detected,
            "entities": {ent.text: ent.label_ for ent in doc.ents}
        }

    def parse_url(self, url):
        """Split ``url`` into its components using ``furl``.

        Args:
            url: The URL to parse.

        Returns:
            A dict with the keys ``'scheme'``, ``'host'``, ``'path'``,
            ``'query'`` and ``'fragment'``, holding the corresponding
            attributes of the parsed ``furl`` object (``'query'`` is taken
            from ``furl.args``).
        """
        parsed = furl(url)
        return {
            'scheme': parsed.scheme,
            'host': parsed.host,
            'path': parsed.path,
            'query': parsed.args,
            'fragment': parsed.fragment,
        }