-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoi_extraction.Rmd
66 lines (52 loc) · 1.29 KB
/
doi_extraction.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
---
title: "DOI extraction and matching"
output: github_document
---
```{r, echo = FALSE, message = FALSE, warning = FALSE}
# Document-wide chunk defaults: show the code, hide messages and warnings,
# and merge each chunk's source and output into one block prefixed with "#>".
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>"
)
```
```{r}
library(tidyverse)
library(stringi)
library(biblids) # https://github.com/subugoe/biblids
```
```{r}
# Load the CSV export to scan for DOIs. The extraction step below reads its
# `npl_text` and `publication_number` columns.
npl_df <- readr::read_csv(file = "data/bq_doi_20210824.csv")
```
Extract DOIs from the `npl_text` column and reshape them to one row per DOI.
```{r}
# Extract every DOI found in `npl_text` and keep it next to the publication
# number it came from.
# NOTE(review): `str_extract_all_doi()` presumably returns a character matrix
# with one column per match (NA-padded) -- confirm against the biblids docs.
npl_tt <- npl_df %>%
  mutate(doi = biblids::str_extract_all_doi(npl_text)) %>%
  select(doi, publication_number)

# Turn the extracted matches into regular columns, then reshape to long form:
# one row per (publication_number, DOI) pair, dropping empty (NA) matches.
npl_tidy <- tibble(
  as.data.frame(npl_tt$doi),
  publication_number = npl_tt$publication_number
) %>%
  pivot_longer(!publication_number) %>%
  filter(!is.na(value)) %>%
  select(-name) %>%
  # Strip the whole run of trailing ".", ">", "," or ";" in a single pass.
  # Removing them one character at a time in a fixed order left residue
  # behind for endings such as ".;" or ".," (the dot survived) and "..".
  mutate(doi_cleaned = str_remove(value, "[.>,;]+$")) %>%
  select(-value)

# Print the result for inspection in the rendered document.
npl_tidy

# Persist the cleaned DOIs for the follow-up check.
npl_tidy %>%
  write_csv("data/dois_to_be_checked.csv")
```
Upload to Google BigQuery
```{r}
library(bigrquery)
# Reference to the destination table (project, dataset, table).
patent_dois <- bq_table(
  project = "api-project-764811344545",
  dataset = "tmp",
  table = "patent_dois"
)

# Remove a previous copy of the table, if any, before uploading.
if (bq_table_exists(patent_dois)) {
  bq_table_delete(patent_dois)
}

# Upload the cleaned DOI table.
bigrquery::bq_table_upload(x = patent_dois, values = npl_tidy)
```