Skip to content
This repository has been archived by the owner on Jan 22, 2022. It is now read-only.

Latest commit

 

History

History
940 lines (905 loc) · 16.4 KB

dataExplorationScript.md

File metadata and controls

940 lines (905 loc) · 16.4 KB
import boto3
import botocore
import json
import pandas as pd
import utils.load_data_util

# Raise pandas' display limits so a dataframe is shown in one view
# instead of being truncated or wrapped across several chunks.
_DISPLAY_OPTIONS = {
    'display.max_columns': 500,
    'display.expand_frame_repr': False,
    'display.max_rows': 50000,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)
# Module-level boto3 resource handle for the S3 service; it is not used by
# the code visible in this part of the notebook.
s3 = boto3.resource('s3')
# Helper function to trim the json files into a proper json format
def process_string(data):
    """Return *data* with its first and last characters swapped for brackets.

    The leading and trailing characters are dropped and the remainder is
    wrapped in ``[`` and ``]``. Strings of length 0 or 1 therefore yield
    ``"[]"``.
    """
    inner = data[1:-1]
    return "".join(("[", inner, "]"))

# Helper function to count the occurrences of each value of a given key
def count_key(data, key, key_value_count):
    """Tally how often each value of *key* appears across *data*.

    Args:
        data: Iterable of mappings; each one is indexed with ``key``.
        key: The key whose value is looked up in every item of ``data``.
        key_value_count: Dict of value -> count. It is updated in place, so
            existing tallies are extended rather than reset.

    Returns:
        None. The result is delivered through ``key_value_count``.

    Raises:
        KeyError: If an item in ``data`` is a dict that lacks ``key``.
    """
    for record in data:
        observed = record[key]
        if observed in key_value_count:
            key_value_count[observed] += 1
        else:
            key_value_count[observed] = 1
# Load a sample through the project's loader (called with 50; presumably the
# sample size -- confirm against utils.load_data_util).
result = utils.load_data_util.load_random_data(50)
unique_args = result.arguments.unique()

# Write every distinct `arguments` value to uniqueArgs.txt, one per line,
# encoded as UTF-8 bytes.
count = 0
with open("uniqueArgs.txt", "wb") as f:
    f.writelines((str(arg) + "\n").encode("utf-8") for arg in unique_args)
count += len(unique_args)  # number of distinct values written

# Count the entries in each column for every distinct `symbol`.
grouped_by_symbol = result.groupby(['symbol']).count()
grouped_by_symbol
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
arguments call_stack crawl_id file_number func_name in_iframe location operation script_col script_line script_loc_eval script_url time_stamp value
symbol
CanvasRenderingContext2D.fillRect 1 1 1 1 1 1 1 1 1 1 1 1 1 1
CanvasRenderingContext2D.fillStyle 0 2 2 2 2 2 2 2 2 2 2 2 2 2
CanvasRenderingContext2D.textBaseline 0 1 1 1 1 1 1 1 1 1 1 1 1 1
HTMLCanvasElement.getContext 3 3 3 3 3 3 3 3 3 3 3 3 3 3
HTMLCanvasElement.height 0 1 1 1 1 1 1 1 1 1 1 1 1 1
HTMLCanvasElement.style 0 1 1 1 1 1 1 1 1 1 1 1 1 1
HTMLCanvasElement.width 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.iceGatheringState 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.idpLoginUrl 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.localDescription 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.onicecandidate 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.onremovestream 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.peerIdentity 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.remoteDescription 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.signalingState 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.Storage.getItem 182 182 182 182 182 182 182 182 182 182 182 182 182 182
window.Storage.key 3 3 3 3 3 3 3 3 3 3 3 3 3 3
window.Storage.length 0 5 5 5 5 5 5 5 5 5 5 5 5 5
window.Storage.removeItem 35 35 35 35 35 35 35 35 35 35 35 35 35 35
window.Storage.setItem 49 49 49 49 49 49 49 49 49 49 49 49 49 49
window.document.cookie 0 479 479 479 479 479 479 479 479 479 479 479 479 479
window.localStorage 0 94 94 94 94 94 94 94 94 94 94 94 94 94
window.name 0 31 31 31 31 31 31 31 31 31 31 31 31 31
window.navigator.appCodeName 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.navigator.appName 0 20 20 20 20 20 20 20 20 20 20 20 20 20
window.navigator.appVersion 0 1 1 1 1 1 1 1 1 1 1 1 1 1
window.navigator.cookieEnabled 0 14 14 14 14 14 14 14 14 14 14 14 14 14
window.navigator.language 0 21 21 21 21 21 21 21 21 21 21 21 21 21
window.navigator.mimeTypes[application/futuresplash].type 0 4 4 4 4 4 4 4 4 4 4 4 4 4
window.navigator.mimeTypes[application/x-shockwave-flash].type 0 3 3 3 3 3 3 3 3 3 3 3 3 3
window.navigator.onLine 0 1 1 1 1 1 1 1 1 1 1 1 1 1
window.navigator.platform 0 23 23 23 23 23 23 23 23 23 23 23 23 23
window.navigator.plugins[Shockwave Flash].description 0 39 39 39 39 39 39 39 39 39 39 39 39 39
window.navigator.plugins[Shockwave Flash].filename 0 7 7 7 7 7 7 7 7 7 7 7 7 7
window.navigator.plugins[Shockwave Flash].length 0 9 9 9 9 9 9 9 9 9 9 9 9 9
window.navigator.plugins[Shockwave Flash].name 0 10 10 10 10 10 10 10 10 10 10 10 10 10
window.navigator.plugins[Shockwave Flash].version 0 7 7 7 7 7 7 7 7 7 7 7 7 7
window.navigator.product 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.navigator.productSub 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.navigator.userAgent 0 258 258 258 258 258 258 258 258 258 258 258 258 258
window.navigator.vendor 0 7 7 7 7 7 7 7 7 7 7 7 7 7
window.navigator.vendorSub 0 1 1 1 1 1 1 1 1 1 1 1 1 1
window.screen.colorDepth 0 22 22 22 22 22 22 22 22 22 22 22 22 22
window.screen.pixelDepth 0 5 5 5 5 5 5 5 5 5 5 5 5 5
window.sessionStorage 0 65 65 65 65 65 65 65 65 65 65 65 65 65
# Pairwise correlation of the numeric/boolean columns only. The frame also
# holds text columns (symbol, script_url, ...); pandas >= 2.0 no longer drops
# those silently in corr() and raises instead, so select the numeric subset
# explicitly. This works on both old and new pandas.
result.select_dtypes(include=["number", "bool"]).corr()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
crawl_id file_number in_iframe
crawl_id NaN NaN NaN
file_number NaN 1.000000 0.137485
in_iframe NaN 0.137485 1.000000