Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

Commit

Permalink
Search by language, URL-host networks, fixes to media frequency module
Browse files Browse the repository at this point in the history
  • Loading branch information
Emile den Tex committed Jan 15, 2019
1 parent f40455f commit e75ca2b
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 7 deletions.
33 changes: 30 additions & 3 deletions analysis/common/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,15 @@
$keywordToTrack = "";

if (isset($_GET['from_user_lang']) && !empty($_GET['from_user_lang']))
$from_user_lang = trim(strtolower($_GET['from_user_lang']));
$from_user_lang = trim($_GET['from_user_lang']);
else
$from_user_lang = "";

if (isset($_GET['lang']) && !empty($_GET['lang']))
$lang = trim($_GET['lang']);
else
$lang = "";

if (isset($_GET['minimumCowordFrequencyOverall']))
$minimumCowordFrequencyOverall = $_GET['minimumCowordFrequencyOverall'];
else
Expand Down Expand Up @@ -427,6 +432,25 @@ function sqlSubset($where = NULL) {
$sql .= "from_user_lang = '" . $esc['mysql']['from_user_lang'] . "' AND ";
}
}
if (!empty($esc['mysql']['lang'])) {
if (strstr($esc['mysql']['lang'], "AND") !== false) {
$subqueries = explode(" AND ", $esc['mysql']['lang']);
foreach ($subqueries as $subquery) {
$sql .= "lang = '" . $subquery . "' AND ";
}
} elseif (strstr($esc['mysql']['lang'], "OR") !== false) {
$subqueries = explode(" OR ", $esc['mysql']['lang']);
$sql .= "(";
foreach ($subqueries as $subquery) {
$sql .= "lang = '" . $subquery . "' OR ";
}
$sql = substr($sql, 0, -3) . ") AND ";
} else {
$sql .= "lang = '" . $esc['mysql']['lang'] . "' AND ";
}
}



$sql .= " t.created_at >= '" . $esc['datetime']['startdate'] . "' AND t.created_at <= '" . $esc['datetime']['enddate'] . "' ";
//print $sql."<br>"; die;
Expand Down Expand Up @@ -773,7 +797,7 @@ function decodeAndFlatten($text) {
// make sure that we have all the right types and values
// also make sure one cannot do a mysql injection attack
function validate_all_variables() {
global $esc, $query, $url_query, $media_url_query, $geo_query, $dataset, $exclude, $from_user_name, $exclude_from_user_name, $from_user_description, $from_source, $startdate, $enddate, $interval, $databases, $connection, $keywords, $database, $minf, $topu, $from_user_lang, $outputformat;
global $esc, $query, $url_query, $media_url_query, $geo_query, $dataset, $exclude, $from_user_name, $exclude_from_user_name, $from_user_description, $from_source, $startdate, $enddate, $interval, $databases, $connection, $keywords, $database, $minf, $topu, $from_user_lang, $lang, $outputformat;

$esc['mysql']['dataset'] = validate($dataset, "mysql-literal");
$esc['mysql']['query'] = validate($query, "mysql-literal");
Expand All @@ -786,6 +810,7 @@ function validate_all_variables() {
$esc['mysql']['exclude_from_user_name'] = validate($exclude_from_user_name, "mysql-literal");
$esc['mysql']['from_user_description'] = validate($from_user_description, "mysql-literal");
$esc['mysql']['from_user_lang'] = validate($from_user_lang, "mysql-literal");
$esc['mysql']['lang'] = validate($lang, "mysql-literal");

$esc['shell']['dataset'] = validate($dataset, "shell");
$esc['shell']['query'] = validate($query, "shell");
Expand All @@ -798,6 +823,7 @@ function validate_all_variables() {
$esc['shell']['exclude_from_user_name'] = validate($exclude_from_user_name, "shell");
$esc['shell']['from_user_description'] = validate($from_user_description, "shell");
$esc['shell']['from_user_lang'] = validate($from_user_lang, "shell");
$esc['shell']['lang'] = validate($lang, "shell");
$esc['shell']['datasetname'] = validate($dataset, "shell");

$esc['shell']['minf'] = validate($minf, 'frequency');
Expand Down Expand Up @@ -877,7 +903,7 @@ function get_status($variable) {
return null;
}

// Output format: {dataset}-{startdate}-{enddate}-{query}-{exclude}-{from_user_name}-{exclude_from_user_name}-{from_user_description}-{from_user_lang}-{url_query}-{media_url_query}--{module_name}-{module_settings}-{hash}.{filetype}
// Output format: {dataset}-{startdate}-{enddate}-{query}-{exclude}-{from_user_name}-{exclude_from_user_name}-{from_user_description}-{from_user_lang}-{lang}-{url_query}-{media_url_query}--{module_name}-{module_settings}-{hash}.{filetype}
function get_filename_for_export($module, $settings = "", $filetype = "csv") {
global $resultsdir, $esc;

Expand All @@ -901,6 +927,7 @@ function get_filename_for_export($module, $settings = "", $filetype = "csv") {
$filename .= "-" . substr($esc['shell']["exclude_from_user_name"],0,20);
$filename .= "-" . $esc['shell']["from_user_description"];
$filename .= "-" . $esc['shell']["from_user_lang"];
$filename .= "-" . $esc['shell']["lang"];
$filename .= "-" . $esc['shell']["url_query"];
$filename .= "-" . $esc['shell']["media_url_query"];
$filename .= "-" . str_replace(",", "_", str_replace(" ", "x", $esc['shell']["geo_query"]));
Expand Down
23 changes: 21 additions & 2 deletions analysis/index.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ function sendUrl(_file) {
"&exclude=" + $("#ipt_exclude").val().replace(/#/g,"%23") +
"&from_user_name=" + $("#ipt_from_user").val() +
"&from_user_lang=" + $("#ipt_user_lang").val() +
"&lang=" + $("#ipt_lang").val() +
"&exclude_from_user_name=" + $("#ipt_exclude_from_user").val() +
"&from_user_description=" + $("#ipt_user_bio").val() +
"&from_source=" + $("#ipt_from_source").val().replace(/#/g,"%23") +
Expand Down Expand Up @@ -95,6 +96,10 @@ function askFrequency() {
var minf = parseInt(prompt("Specify the minimum frequency for data to be included in the export:","2"), 10);
return minf;
}
/**
 * Ask the user for the minimum frequency to use for the media frequency export.
 * The prompt is pre-filled with "0".
 *
 * @returns {number} the integer that was entered, or 0 when the prompt was
 *                   cancelled or the input could not be parsed as a number
 */
function askMediaFrequency() {
    var answer = prompt("Specify the minimum frequency for data to be included in the export:","0");
    var minf = parseInt(answer, 10);
    // parseInt() yields NaN for a cancelled prompt (null) or non-numeric text;
    // fall back to the pre-filled default so callers never build "minf=NaN".
    return isNaN(minf) ? 0 : minf;
}
function askRetweetFrequency() {
var minf = parseInt(prompt("Specify the minimum times a tweet should be retweeted for it to be included in the export:","4"), 10);
return minf;
Expand Down Expand Up @@ -256,6 +261,9 @@ function getExportSettings() {
<tr>
<td class="tbl_head">User language: </td><td><input type="text" id="ipt_user_lang" size="60" name="from_user_lang" value="<?php echo $from_user_lang; ?>" /> (empty: any language*)</td>
</tr>
<tr>
<td class="tbl_head">Tweet language: </td><td><input type="text" id="ipt_lang" size="60" name="lang" value="<?php echo $lang; ?>" /> (empty: any language*)</td>
</tr>

<tr>
<td class="tbl_head">Twitter client URL/descr: </td><td><input type="text" id="ipt_from_source" size="60" name="from_source" value="<?php echo $from_source; ?>" /> (empty: from any client*)</td>
Expand Down Expand Up @@ -610,7 +618,7 @@ function updatestatus() {

<legend>Export selected data</legend>

<p class="txt_desc">All exports have the following filename convention: {dataset}-{startdate}-{enddate}-{query}-{exclude}-{from_user_name}-{exclude_from_user_name}-{from_user_lang}-{url_query}-{media_url_query}--{module_name}-{module_settings}-{dmi-tcat_version}.{filetype}</p>
<p class="txt_desc">All exports have the following filename convention: {dataset}-{startdate}-{enddate}-{query}-{exclude}-{from_user_name}-{exclude_from_user_name}-{from_user_description}-{from_user_lang}-{lang}-{url_query}-{media_url_query}--{module_name}-{module_settings}-{dmi-tcat_version}.{filetype}</p>

<p>
<div class='txt_desc' style='background-color: #eee; padding: 5px;'>Output format for tables:
Expand Down Expand Up @@ -759,7 +767,7 @@ function updatestatus() {
<h3>Media frequency</h3>
<div class="txt_desc">Contains media URLs and the number of times they have been used.</div>
<div class="txt_desc">Use: get a grasp of the most popular media.</div>
<div class="txt_link"> &raquo; <a href="" onclick="var minf = askFrequency(); $('#whattodo').val('media_frequency&minf='+minf+getInterval());sendUrl('mod.media_frequency.php');return false;">launch</a></div>
<div class="txt_link"> &raquo; <a href="" onclick="var minf = askMediaFrequency(); $('#whattodo').val('media_frequency&minf='+minf+getInterval());sendUrl('mod.media_frequency.php');return false;">launch</a></div>

<!-- <hr/> -->

Expand Down Expand Up @@ -964,6 +972,17 @@ function updatestatus() {

<hr />

<hr />
<h3>Bipartite Host-user graph</h3>
<div class="txt_desc">Produces a <a href="http://en.wikipedia.org/wiki/Bipartite_graph">bipartite graph</a> based on co-occurrence of hostnames and users. If a user wrote a tweet with a certain hostname, there will be a link between that user and the hostname.
The more often they appear together, the stronger the link ("<a href="http://en.wikipedia.org/wiki/Weighted_graph#Weighted_graphs_and_networks">link weight</a>").</div>
<div class="txt_desc">Use: explore the relations between users and hosts, find and analyze which users group around which hosts.</div>
<div class="txt_link"> &raquo; <a href="" onclick="$('#whattodo').val('host_user');sendUrl('mod.host_user.php');return false;">launch</a></div>

<hr />



<h3>Bipartite hashtag-URL graph</h3>
<div class="txt_desc">Creates a .csv file that contains URLs and the number of times they have co-occurred with a particular hashtag.</div>
<div class="txt_desc">Creates a .gexf file that contains a <a href="http://en.wikipedia.org/wiki/Bipartite_graph">bipartite graph</a> (.gexf, open in gephi) based on co-occurrence of URLs and hashtags. If a URL co-occurs with a certain hashtag, there will be a link between that URL and the hashtag.
Expand Down
4 changes: 2 additions & 2 deletions analysis/mod.export_tweets.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
$module = "geoTweets";
$filename = get_filename_for_export($module, implode("_", $exportSettings));
$stream_to_open = export_start($filename, $outputformat);

$csv = new CSV($stream_to_open, $outputformat);

// write header
Expand Down Expand Up @@ -59,7 +59,7 @@
$rec->execute();
while ($data = $rec->fetch(PDO::FETCH_ASSOC)) {
$csv->newrow();
if (preg_match("/_urls/", $sql))
if (preg_match("/_urls/", $sql) || preg_match("/_media/", $sql) || preg_match("/_mentions/", $sql))
$id = $data['tweet_id'];
else
$id = $data['id'];
Expand Down
121 changes: 121 additions & 0 deletions analysis/mod.host_user.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
<?php
require_once __DIR__ . '/common/config.php';
require_once __DIR__ . '/common/functions.php';
require_once __DIR__ . '/common/Gexf.class.php';
require_once __DIR__ . '/common/CSV.class.php';
?>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>TCAT :: Host user co-occurrence</title>

<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

<link rel="stylesheet" href="css/main.css" type="text/css" />

<script type="text/javascript" language="javascript">



</script>

</head>

<body>

<h1>TCAT :: Host user co-occurrence</h1>

<?php
// Host-user co-occurrence export.
// Writes the (frequency, user, domain) table of the current selection to a CSV
// file, then builds a bipartite domain/user network from the same rows and
// writes it to a GEXF file. A download link is printed for each file.
validate_all_variables();
dataset_must_exist();
$dbh = pdo_connect();
pdo_unbuffered($dbh);
$collation = current_collation();
$filename = get_filename_for_export("hostUser");
$csv = new CSV($filename, $outputformat);

// Count, per (domain, lower-cased user name) pair, the joined url/tweet rows
// that fall inside the current selection; rows without a followed URL are skipped.
$sql = "SELECT COUNT(LOWER(t.from_user_name COLLATE $collation)) AS frequency, LOWER(t.from_user_name COLLATE $collation) AS username, u.domain AS domain FROM ";
$sql .= $esc['mysql']['dataset'] . "_urls u, " . $esc['mysql']['dataset'] . "_tweets t ";
$where = "t.id = u.tweet_id AND u.url_followed !='' AND ";
$sql .= sqlSubset($where);
$sql .= " GROUP BY u.domain, LOWER(t.from_user_name) ORDER BY frequency DESC";

// domain => (username => frequency). Initialised before the fetch loop so that
// an empty result set still leaves an array for the loops further down.
$urlUsernames = array();

$csv->writeheader(array("frequency", "user", "domain"));
$rec = $dbh->prepare($sql);
$rec->execute();
while ($res = $rec->fetch(PDO::FETCH_ASSOC)) {
    $csv->newrow();
    $csv->addfield($res['frequency']);
    $csv->addfield($res['username']);
    $csv->addfield($res['domain']);
    $csv->writerow();
    $urlUsernames[$res['domain']][$res['username']] = $res['frequency'];
}
$csv->close();

echo '<fieldset class="if_parameters">';

echo '<legend>Your spreadsheet (CSV) file</legend>';

// The href is built with filename_to_url(), the same helper the GEXF link below uses.
// The visible file name is HTML-escaped because it is assembled from request parameters.
echo '<p><a href="' . filename_to_url($filename) . '">' . htmlspecialchars($filename, ENT_QUOTES, 'UTF-8') . '</a></p>';

echo '</fieldset>';

// Per-node totals, attached to the graph nodes as attributes:
//   *Unique* = number of distinct partners, *Total* = sum of the pair frequencies.
$userUniqueUrls = array();
$userTotalUrls = array();
$urlUniqueUsers = array();
$urlTotalUsers = array();

foreach ($urlUsernames as $url => $usernames) {
    if (!isset($urlUniqueUsers[$url])) {
        $urlUniqueUsers[$url] = 0;
    }
    if (!isset($urlTotalUsers[$url])) {
        $urlTotalUsers[$url] = 0;
    }
    foreach ($usernames as $username => $frequency) {
        if (!isset($userUniqueUrls[$username])) {
            $userUniqueUrls[$username] = 0;
        }
        if (!isset($userTotalUrls[$username])) {
            $userTotalUrls[$username] = 0;
        }
        $urlUniqueUsers[$url]++;
        $urlTotalUsers[$url] += $frequency;
        $userUniqueUrls[$username]++;
        $userTotalUrls[$username] += $frequency;
    }
}

// Build the network: one node per domain, one node per user, and one
// undirected edge per (domain, user) pair weighted by the pair frequency.
$gexf = new Gexf();
$gexf->setTitle("Host-user " . $filename);
$gexf->setEdgeType(GEXF_EDGE_UNDIRECTED);
$gexf->setCreator("tools.digitalmethods.net");
foreach ($urlUsernames as $url => $usernames) {
    foreach ($usernames as $username => $frequency) {
        $node1 = new GexfNode($url);
        $node1->addNodeAttribute("type", 'domain', "string");
        $node1->addNodeAttribute('longlabel', $url, "string");
        $node1->addNodeAttribute('unique_users', $urlUniqueUsers[$url], "integer");
        $node1->addNodeAttribute('total_users', $urlTotalUsers[$url], "integer");
        $gexf->addNode($node1);

        $node2 = new GexfNode($username);
        $node2->addNodeAttribute("type", 'user', "string");
        $node2->addNodeAttribute('longlabel', $username, "string");
        $node2->addNodeAttribute('unique_domains', $userUniqueUrls[$username], "integer");
        $node2->addNodeAttribute('total_domains', $userTotalUrls[$username], "integer");
        $gexf->addNode($node2);

        $gexf->addEdge($node1, $node2, $frequency);
    }
}

$gexf->render();

// Same export name as the CSV, but with the gexf file type.
$filename = get_filename_for_export("hostUser", '', 'gexf');
file_put_contents($filename, $gexf->gexfFile);

echo '<fieldset class="if_parameters">';

echo '<legend>Your network (GEXF) file</legend>';

echo '<p><a href="' . filename_to_url($filename) . '">' . htmlspecialchars($filename, ENT_QUOTES, 'UTF-8') . '</a></p>';

echo '</fieldset>';
?>

</body>
</html>
3 changes: 3 additions & 0 deletions analysis/mod.media_frequency.php
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@
foreach ($media_url_count as $datepart => $url_count) {
arsort($url_count);
foreach ($url_count as $url => $count) {
if ($minf > 0 && $count < $minf) {
continue;
}
$csv->newrow();
$csv->addfield($datepart);
$csv->addfield($url);
Expand Down

0 comments on commit e75ca2b

Please sign in to comment.