*(Image: "Keep calm and comply with GDPR")*

# GDPR Fines

h/t to Bob Rudis for sharing the data source, and to Roel Hogervorst for the guide to scraping this data. Roel provided the bulk of the scraping code, and I added a bit of additional data cleaning. The data this week comes from [Privacy Affairs](https://www.privacyaffairs.com/gdpr-fines/).

I have also included the raw text (gdpr_text.tsv) of the GDPR legal articles themselves, in case anyone is interested in parsing them or using them alongside the violations (see the join sketch after the data dictionary).

Per Wikipedia, the GDPR is:

> The General Data Protection Regulation (EU) 2016/679 (GDPR) is a regulation in EU law on data protection and privacy in the European Union (EU) and the European Economic Area (EEA). It also addresses the transfer of personal data outside the EU and EEA areas. The GDPR aims primarily to give control to individuals over their personal data and to simplify the regulatory environment for international business by unifying the regulation within the EU. Superseding the Data Protection Directive 95/46/EC, the regulation contains provisions and requirements related to the processing of personal data of individuals (formally called data subjects in the GDPR) who reside in the EEA, and applies to any enterprise—regardless of its location and the data subjects' citizenship or residence—that is processing the personal information of data subjects inside the EEA.

## Get the data here

```r
# Get the Data

gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')

# Or read in with the tidytuesdayR package (https://github.com/thebioengineer/tidytuesdayR)
# Please note: to use the 2020 data, you need the tidytuesdayR version released after January 2020.

# Either ISO-8601 date or year/week works!

# Install via devtools::install_github("thebioengineer/tidytuesdayR")

tuesdata <- tidytuesdayR::tt_load('2020-04-21')
tuesdata <- tidytuesdayR::tt_load(2020, week = 17)


gdpr_violations <- tuesdata$gdpr_violations
```

## Data Dictionary

### gdpr_violations.tsv

|variable         |class     |description |
|:----------------|:---------|:-----------|
|id               |integer   |Identifier for the fine/violation |
|picture          |character |SVG image of the violating country's flag |
|name             |character |Name of the country where the violation was enforced |
|price            |integer   |Fine amount in Euros (€) |
|authority        |character |Authority that enacted the violation |
|date             |character |Date of the violation |
|controller       |character |Controller of the data, i.e. the violator |
|article_violated |character |Specific GDPR article violated (see the gdpr_text.tsv data for specifics) |
|type             |character |Type of violation |
|source           |character |Original source (URL) of the fine data |
|summary          |character |Summary of the violation |
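
Note that `date` ships as plain character. Below is a minimal first-pass tidying sketch; the month/day/year parsing is an assumption about the raw file, so spot-check a few rows (unparseable dates come back as `NA`):

```r
library(tidyverse)
library(lubridate)

gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')

# Parse the character dates (assumed to be month/day/year) and pull the
# ten largest fines; price is already an integer number of Euros
gdpr_violations %>%
  mutate(date = mdy(date)) %>%
  arrange(desc(price)) %>%
  select(name, date, controller, price) %>%
  head(10)
```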

### gdpr_text.tsv

|variable      |class     |description |
|:-------------|:---------|:-----------|
|chapter       |double    |GDPR chapter number |
|chapter_title |character |Chapter title |
|article       |double    |GDPR article number |
|article_title |character |Article title |
|sub_article   |double    |Sub-article number |
|gdpr_text     |character |Raw text of the article/sub-article |
|href          |character |URL to the raw text itself |
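
Because `article_violated` can cite several articles separated by `|`, linking fines to article titles takes a little parsing. Here is a hedged sketch; the digit-extraction regex assumes citations shaped like "Art. 32 GDPR", so free-text citations won't match and will join as `NA`:

```r
library(tidyverse)

gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')

# One row per cited article, taking the leading digits of each citation
# as the article number
violations_by_article <- gdpr_violations %>%
  separate_rows(article_violated, sep = "\\|") %>%
  mutate(article = as.double(str_extract(article_violated, "[:digit:]+")))

# Attach the official article titles and count the most-violated articles
violations_by_article %>%
  left_join(distinct(gdpr_text, article, article_title), by = "article") %>%
  count(article, article_title, sort = TRUE)
```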

## Cleaning Script

```r
library(tidyverse)
library(rvest)

# Note the following code was adapted from
# https://blog.rmhogervorst.nl/blog/2020/04/08/scraping-gdpr-fines/

link <- "https://www.privacyaffairs.com/gdpr-fines/"
page <- read_html(link)


# The fines are embedded in the page as JSON inside a <script> tag
# (the 9th one at the time of scraping)
temp <- page %>% 
  html_nodes("script") %>% 
  .[9] %>% 
  rvest::html_text() 

# Locate the [ ... ] bracket pairs that delimit the two embedded JSON arrays
ends <- str_locate_all(temp, "\\]")
starts <- str_locate_all(temp, "\\[")

table1 <- temp %>% 
  stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>% 
  str_remove_all("\\\n") %>% 
  str_remove_all("\\\r") %>%
  jsonlite::fromJSON() %>% 
  as_tibble() %>% 
  mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))


# Repeat the slice-and-parse for the second JSON array
table2 <- temp %>% 
  stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>% 
  str_remove_all("\\\n") %>% 
  str_remove_all("\\\r") %>%  
  jsonlite::fromJSON() %>% 
  as_tibble() %>% 
  mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))


all_df <- bind_rows(table1, table2) %>% 
  janitor::clean_names() %>%
  mutate(
    authority = str_remove(authority, "\t"),
    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>% 
      str_remove('</a>'),
    article_violated = str_replace_all(article_violated, ", Art", "|Art"),
    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>% 
      str_remove('</a>')
  )

# most frequent articles violated
all_df %>% 
  separate_rows(article_violated, sep = "\\|") %>% 
  count(article_violated, sort = TRUE)

all_df %>% 
  write_tsv("2020/2020-04-21/gdpr_violations.tsv")


# Getting the actual article text -----------------------------------------

raw_article <- "https://gdpr-info.eu/" %>% 
  read_html()

# Get all the urls for specific articles/chapters
gdpr_href <- raw_article %>% 
  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
  html_nodes("a") %>% 
  html_attr("href")

# pull the titles as well
gdpr_titles <- raw_article %>% 
  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
  html_nodes("a") %>% 
  html_attr("data-title")

# pull the numbers of article/chapters
gdpr_numbers <- raw_article %>% 
  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
  html_nodes("a") %>% 
  html_text()

# put it all into a df
gdpr_df <- tibble(
  article = gdpr_numbers,
  title = str_trim(gdpr_titles),
  href = gdpr_href
) 

# Tidy up the data, create chapters vs articles
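# (chapter rows are detected by length: their anchor text reads
#  "Chapter 1", "Chapter 2", ..., which is longer than a bare article
#  number, hence the str_length(article) > 3 test)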
clean_gdpr <- gdpr_df %>% 
  mutate(chapter = if_else(str_length(article) > 3, article, NA_character_),
         chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>% 
  fill(chapter, chapter_title) %>% 
  filter(!str_detect(article, "Chapter")) %>% 
  mutate(article = as.double(article)) %>% 
  filter(!is.na(article)) %>% 
  select(starts_with("chapter"), article, article_title = title, href)

clean_gdpr

# NOTE: LONG-running step
# Get all the raw html from each of the urls for each article
all_articles <- clean_gdpr %>% 
  mutate(raw_html = map(href, read_html))

# function to take raw html and turn it into text for that specific article
get_gdpr_text <- function(html_in){
  
  test_var <- html_in %>% 
    html_node(".entry-content") %>% 
    html_nodes("ol") %>% 
    html_text()
  
  if (length(test_var) == 0){
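   # No ordered list found: the article body is a single paragraph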
   text <- html_in %>%
     html_node(".entry-content > p") %>% 
     html_text() %>% 
     str_remove("^[:digit:]") 
  } else {
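    # Ordered list present: take the first <ol>, protect mid-item line
    # breaks as tabs, split the items apart, then restore the breaks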
    text <- html_in %>% 
      html_node(".entry-content") %>% 
      html_nodes("ol") %>% 
      html_text() %>% 
      .[[1]] %>% 
      str_replace_all(";\n", "\t") %>% 
      str_replace_all(":\n", "\t") %>% 
      str_split("\n") %>% 
      .[[1]] %>% 
      .[. != ""] %>% 
      str_replace_all("\t", "\n") %>% 
      str_remove("^[:digit:]")
  }
  
  
  text
}

# Test
get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))

# unnest the list column of text
clean_articles <- all_articles %>% 
  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>% 
  unnest_longer(gdpr_text)

# final dataframe
final_articles <- clean_articles %>% 
  group_by(article) %>% 
  mutate(sub_article = row_number()) %>% 
  relocate(sub_article, .after = "article_title") %>% 
  relocate(gdpr_text, .after = "sub_article") %>% 
  ungroup() %>% 
  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>% 
  mutate_at(vars(chapter, article, sub_article), as.double) %>% 
  select(-raw_html)

final_articles %>% view()

write_tsv(final_articles, "2020/2020-04-21/gdpr_text.tsv")
```