diff --git a/CITATION.cff b/CITATION.cff index 6ef872c..a5f31af 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -28,5 +28,5 @@ keywords: url: "https://github.com/CWTSLeiden/CWTS-OpenAlex-databases#readme" repository-code: "https://github.com/CWTSLeiden/CWTS-OpenAlex-databases" license: MIT -version: 2023nov -date-released: 2024-01-30 +version: 2024aug +date-released: 2024-10-02 diff --git a/data/validation/json_paths_authors.txt b/data/validation/json_paths_authors.txt index 5808c1b..c9795f3 100644 --- a/data/validation/json_paths_authors.txt +++ b/data/validation/json_paths_authors.txt @@ -1,3 +1,12 @@ +affiliations[seq].institution.country_code +affiliations[seq].institution.country_id +affiliations[seq].institution.display_name +affiliations[seq].institution.id +affiliations[seq].institution.lineage[seq] +affiliations[seq].institution.ror +affiliations[seq].institution.type +affiliations[seq].institution.type_id +affiliations[seq].years[seq] cited_by_count counts_by_year[seq].cited_by_count counts_by_year[seq].oa_works_count @@ -19,6 +28,14 @@ last_known_institution.id last_known_institution.lineage[seq] last_known_institution.ror last_known_institution.type +last_known_institutions[seq].country_code +last_known_institutions[seq].country_id +last_known_institutions[seq].display_name +last_known_institutions[seq].id +last_known_institutions[seq].lineage[seq] +last_known_institutions[seq].ror +last_known_institutions[seq].type +last_known_institutions[seq].type_id most_cited_work orcid summary_stats.2yr_cited_by_count @@ -31,6 +48,24 @@ summary_stats.h_index summary_stats.i10_index summary_stats.oa_percent summary_stats.works_count +topic_share[seq].display_name +topic_share[seq].domain.display_name +topic_share[seq].domain.id +topic_share[seq].field.display_name +topic_share[seq].field.id +topic_share[seq].id +topic_share[seq].subfield.display_name +topic_share[seq].subfield.id +topic_share[seq].value +topics[seq].count +topics[seq].display_name 
+topics[seq].domain.display_name +topics[seq].domain.id +topics[seq].field.display_name +topics[seq].field.id +topics[seq].id +topics[seq].subfield.display_name +topics[seq].subfield.id updated updated_date works_api_url diff --git a/data/validation/json_paths_domains.txt b/data/validation/json_paths_domains.txt new file mode 100644 index 0000000..9ac0f86 --- /dev/null +++ b/data/validation/json_paths_domains.txt @@ -0,0 +1,16 @@ +cited_by_count +created_date +description +display_name +display_name_alternatives[seq] +fields[seq].display_name +fields[seq].id +id +ids.wikidata +ids.wikipedia +siblings[seq].display_name +siblings[seq].id +updated +updated_date +works_api_url +works_count diff --git a/data/validation/json_paths_fields.txt b/data/validation/json_paths_fields.txt new file mode 100644 index 0000000..81131c3 --- /dev/null +++ b/data/validation/json_paths_fields.txt @@ -0,0 +1,19 @@ +cited_by_count +created_date +description +display_name +display_name_alternatives +display_name_alternatives[seq] +domain.display_name +domain.id +id +ids.wikidata +ids.wikipedia +siblings[seq].display_name +siblings[seq].id +subfields[seq].display_name +subfields[seq].id +updated +updated_date +works_api_url +works_count diff --git a/data/validation/json_paths_funders.txt b/data/validation/json_paths_funders.txt index fbfb689..eeaa2d0 100644 --- a/data/validation/json_paths_funders.txt +++ b/data/validation/json_paths_funders.txt @@ -1,6 +1,7 @@ alternate_titles[seq] cited_by_count country_code +country_id counts_by_year[seq].cited_by_count counts_by_year[seq].oa_works_count counts_by_year[seq].works_count diff --git a/data/validation/json_paths_institutions.txt b/data/validation/json_paths_institutions.txt index 9267a30..864f573 100644 --- a/data/validation/json_paths_institutions.txt +++ b/data/validation/json_paths_institutions.txt @@ -1,12 +1,15 @@ associated_institutions[seq].country_code +associated_institutions[seq].country_id associated_institutions[seq].display_name 
associated_institutions[seq].id associated_institutions[seq].lineage[seq] associated_institutions[seq].relationship associated_institutions[seq].ror associated_institutions[seq].type +associated_institutions[seq].type_id cited_by_count country_code +country_id counts_by_year[seq].cited_by_count counts_by_year[seq].oa_works_count counts_by_year[seq].works_count @@ -33,23 +36,24 @@ ids.wikipedia image_thumbnail_url image_url international.display_name[skip] +is_super_system lineage[seq] relationship repositories[seq].display_name -repositories[seq].host_institution_lineage[seq] -repositories[seq].host_institution_lineage_names[seq] repositories[seq].host_organization repositories[seq].host_organization_lineage[seq] repositories[seq].host_organization_lineage_names[seq] repositories[seq].host_organization_name repositories[seq].id +repositories[seq].is_core repositories[seq].is_in_doaj repositories[seq].is_oa repositories[seq].issn +repositories[seq].issn[seq] repositories[seq].issn_l repositories[seq].publisher -repositories[seq].publisher_id repositories[seq].type +repositories[seq].type_id roles[seq].id roles[seq].role roles[seq].works_count @@ -64,7 +68,26 @@ summary_stats.h_index summary_stats.i10_index summary_stats.oa_percent summary_stats.works_count +topic_share[seq].display_name +topic_share[seq].domain.display_name +topic_share[seq].domain.id +topic_share[seq].field.display_name +topic_share[seq].field.id +topic_share[seq].id +topic_share[seq].subfield.display_name +topic_share[seq].subfield.id +topic_share[seq].value +topics[seq].count +topics[seq].display_name +topics[seq].domain.display_name +topics[seq].domain.id +topics[seq].field.display_name +topics[seq].field.id +topics[seq].id +topics[seq].subfield.display_name +topics[seq].subfield.id type +type_id updated updated_date works_api_url diff --git a/data/validation/json_paths_sources.txt b/data/validation/json_paths_sources.txt index 1bd2bfa..f31519d 100644 --- a/data/validation/json_paths_sources.txt 
+++ b/data/validation/json_paths_sources.txt @@ -7,6 +7,7 @@ apc_prices[seq].price apc_usd cited_by_count country_code +country_id counts_by_year[seq].cited_by_count counts_by_year[seq].oa_works_count counts_by_year[seq].works_count @@ -14,8 +15,6 @@ counts_by_year[seq].year created_date display_name homepage_url -host_institution_lineage[seq] -host_institution_lineage_names[seq] host_organization host_organization_lineage[seq] host_organization_lineage_names[seq] @@ -27,6 +26,7 @@ ids.issn_l ids.mag ids.openalex ids.wikidata +is_core is_in_doaj is_oa issn @@ -49,7 +49,26 @@ summary_stats.h_index summary_stats.i10_index summary_stats.oa_percent summary_stats.works_count +topic_share[seq].display_name +topic_share[seq].domain.display_name +topic_share[seq].domain.id +topic_share[seq].field.display_name +topic_share[seq].field.id +topic_share[seq].id +topic_share[seq].subfield.display_name +topic_share[seq].subfield.id +topic_share[seq].value +topics[seq].count +topics[seq].display_name +topics[seq].domain.display_name +topics[seq].domain.id +topics[seq].field.display_name +topics[seq].field.id +topics[seq].id +topics[seq].subfield.display_name +topics[seq].subfield.id type +type_id updated updated_date works_api_url diff --git a/data/validation/json_paths_subfields.txt b/data/validation/json_paths_subfields.txt new file mode 100644 index 0000000..4ba0bfb --- /dev/null +++ b/data/validation/json_paths_subfields.txt @@ -0,0 +1,20 @@ +cited_by_count +created_date +description +display_name +display_name_alternatives[seq] +domain.display_name +domain.id +field.display_name +field.id +id +ids.wikidata +ids.wikipedia +siblings[seq].display_name +siblings[seq].id +topics[seq].display_name +topics[seq].id +updated +updated_date +works_api_url +works_count diff --git a/data/validation/json_paths_topics.txt b/data/validation/json_paths_topics.txt new file mode 100644 index 0000000..4aff7ba --- /dev/null +++ b/data/validation/json_paths_topics.txt @@ -0,0 +1,20 @@ 
+cited_by_count +created_date +description +display_name +domain.display_name +domain.id +field.display_name +field.id +id +ids.openalex +ids.wikipedia +keywords[seq] +siblings[seq].display_name +siblings[seq].id +subfield.display_name +subfield.id +updated +updated_date +works_api_url +works_count diff --git a/data/validation/json_paths_works.txt b/data/validation/json_paths_works.txt index 781cbba..ca16d77 100644 --- a/data/validation/json_paths_works.txt +++ b/data/validation/json_paths_works.txt @@ -10,17 +10,22 @@ apc_paid.provenance apc_paid.value apc_paid.value_usd authors_count +authorships[seq].affiliations[seq].institution_ids[seq] +authorships[seq].affiliations[seq].raw_affiliation_string authorships[seq].author.display_name authorships[seq].author.id authorships[seq].author.orcid authorships[seq].author_position authorships[seq].countries[seq] +authorships[seq].country_ids[seq] authorships[seq].institutions[seq].country_code +authorships[seq].institutions[seq].country_id authorships[seq].institutions[seq].display_name authorships[seq].institutions[seq].id authorships[seq].institutions[seq].lineage[seq] authorships[seq].institutions[seq].ror authorships[seq].institutions[seq].type +authorships[seq].institutions[seq].type_id authorships[seq].is_corresponding authorships[seq].raw_affiliation_string authorships[seq].raw_affiliation_strings[seq] @@ -33,6 +38,7 @@ best_oa_location.is_oa best_oa_location.is_published best_oa_location.landing_page_url best_oa_location.license +best_oa_location.license_id best_oa_location.pdf_url best_oa_location.source best_oa_location.source.display_name @@ -43,6 +49,7 @@ best_oa_location.source.host_organization_lineage[seq] best_oa_location.source.host_organization_lineage_names[seq] best_oa_location.source.host_organization_name best_oa_location.source.id +best_oa_location.source.is_core best_oa_location.source.is_in_doaj best_oa_location.source.is_oa best_oa_location.source.issn @@ -53,14 +60,18 @@ 
best_oa_location.source.publisher_id best_oa_location.source.publisher_lineage[seq] best_oa_location.source.publisher_lineage_names[seq] best_oa_location.source.type +best_oa_location.source.type_id best_oa_location.version biblio.first_page biblio.issue biblio.last_page biblio.volume +citation_normalized_percentile +citation_normalized_percentile.is_in_top_10_percent +citation_normalized_percentile.is_in_top_1_percent +citation_normalized_percentile.value cited_by_api_url cited_by_count -cited_by_percentile_year cited_by_percentile_year.max cited_by_percentile_year.min concepts[seq].display_name @@ -75,10 +86,16 @@ countries_distinct_count counts_by_year[seq].cited_by_count counts_by_year[seq].year created_date +datasets[seq] display_name doi doi_registration_agency +domains[seq].display_name +domains[seq].id +fields[seq].display_name +fields[seq].id fulltext_origin +fwci grants[seq].award_id grants[seq].funder grants[seq].funder_display_name @@ -90,18 +107,23 @@ ids.mag ids.openalex ids.pmcid ids.pmid +indexed_in[seq] institutions_distinct_count is_paratext is_retracted +keywords[seq].display_name +keywords[seq].id keywords[seq].keyword keywords[seq].score language +language_id locations[seq].doi locations[seq].is_accepted locations[seq].is_oa locations[seq].is_published locations[seq].landing_page_url locations[seq].license +locations[seq].license_id locations[seq].pdf_url locations[seq].source locations[seq].source.display_name @@ -112,6 +134,7 @@ locations[seq].source.host_organization_lineage[seq] locations[seq].source.host_organization_lineage_names[seq] locations[seq].source.host_organization_name locations[seq].source.id +locations[seq].source.is_core locations[seq].source.is_in_doaj locations[seq].source.is_oa locations[seq].source.issn @@ -122,6 +145,7 @@ locations[seq].source.publisher_id locations[seq].source.publisher_lineage[seq] locations[seq].source.publisher_lineage_names[seq] locations[seq].source.type +locations[seq].source.type_id 
locations[seq].version locations_count mesh[seq].descriptor_name @@ -140,6 +164,7 @@ primary_location.is_oa primary_location.is_published primary_location.landing_page_url primary_location.license +primary_location.license_id primary_location.pdf_url primary_location.source primary_location.source.display_name @@ -150,6 +175,7 @@ primary_location.source.host_organization_lineage[seq] primary_location.source.host_organization_lineage_names[seq] primary_location.source.host_organization_name primary_location.source.id +primary_location.source.is_core primary_location.source.is_in_doaj primary_location.source.is_oa primary_location.source.issn @@ -160,19 +186,44 @@ primary_location.source.publisher_id primary_location.source.publisher_lineage[seq] primary_location.source.publisher_lineage_names[seq] primary_location.source.type +primary_location.source.type_id primary_location.version +primary_topic +primary_topic.display_name +primary_topic.domain.display_name +primary_topic.domain.id +primary_topic.field.display_name +primary_topic.field.id +primary_topic.id +primary_topic.score +primary_topic.subfield.display_name +primary_topic.subfield.id publication_date publication_year referenced_works[seq] referenced_works_count related_works[seq] +subfields[seq].display_name +subfields[seq].id summary_stats.2yr_cited_by_count summary_stats.cited_by_count sustainable_development_goals[seq].display_name sustainable_development_goals[seq].id sustainable_development_goals[seq].score title +topics[seq].display_name +topics[seq].domain.display_name +topics[seq].domain.id +topics[seq].field.display_name +topics[seq].field.id +topics[seq].id +topics[seq].score +topics[seq].subfield.display_name +topics[seq].subfield.id +topics_count type type_crossref +type_id updated updated_date +versions[seq] diff --git a/data/validation/openalex_2023nov_authors_json_row_count_tables.tsv b/data/validation/openalex_2023nov_authors_json_row_count_tables.tsv deleted file mode 100644 index 
c3e92f6..0000000 --- a/data/validation/openalex_2023nov_authors_json_row_count_tables.tsv +++ /dev/null @@ -1,3 +0,0 @@ -_file_information 89520174 -author 89520174 -author_display_name_alternative 122147700 diff --git a/data/validation/openalex_2023nov_core_row_count_tables.tsv b/data/validation/openalex_2023nov_core_row_count_tables.tsv deleted file mode 100644 index 0871679..0000000 --- a/data/validation/openalex_2023nov_core_row_count_tables.tsv +++ /dev/null @@ -1,2 +0,0 @@ -source 248650 -work 246328942 diff --git a/data/validation/openalex_2023nov_funders_json_row_count_tables.tsv b/data/validation/openalex_2023nov_funders_json_row_count_tables.tsv deleted file mode 100644 index d737c5e..0000000 --- a/data/validation/openalex_2023nov_funders_json_row_count_tables.tsv +++ /dev/null @@ -1,4 +0,0 @@ -_file_information 32437 -funder 32437 -funder_alternate_title 71555 -funder_role 53106 diff --git a/data/validation/openalex_2023nov_indicators_row_count_tables.tsv b/data/validation/openalex_2023nov_indicators_row_count_tables.tsv deleted file mode 100644 index 6a54a81..0000000 --- a/data/validation/openalex_2023nov_indicators_row_count_tables.tsv +++ /dev/null @@ -1,14 +0,0 @@ -citation 895072472 -classification_system 2 -collaboration_type 3 -database 2 -database_classification_system_research_area_mean_n_cits 10897212 -database_classification_system_research_area_source_mcs_mncs_pp_uncited 843827850 -database_classification_system_research_area_source_pp_top_prop 5062967100 -database_classification_system_research_area_top_threshold 65383272 -database_classification_system_source_pp_top_n_cits 274427664 -doc_type 3 -pub 45108798 -pub_classification_system_research_area 89636364 -pub_database 81224383 -source 218333 diff --git a/data/validation/openalex_2023nov_institutions_json_row_count_tables.tsv b/data/validation/openalex_2023nov_institutions_json_row_count_tables.tsv deleted file mode 100644 index 232f4b5..0000000 --- 
a/data/validation/openalex_2023nov_institutions_json_row_count_tables.tsv +++ /dev/null @@ -1,9 +0,0 @@ -_file_information 107246 -institution 107246 -institution_associated_institution 41579 -institution_display_name_acronym 46020 -institution_display_name_alternative 29635 -institution_international_display_name 354420 -institution_lineage 127496 -institution_repository 2876 -institution_role 126144 diff --git a/data/validation/openalex_2023nov_publishers_json_row_count_tables.tsv b/data/validation/openalex_2023nov_publishers_json_row_count_tables.tsv deleted file mode 100644 index aab828f..0000000 --- a/data/validation/openalex_2023nov_publishers_json_row_count_tables.tsv +++ /dev/null @@ -1,6 +0,0 @@ -_file_information 10250 -publisher 10250 -publisher_alternate_title 4653 -publisher_country_code 8138 -publisher_lineage 10570 -publisher_role 15552 diff --git a/data/validation/openalex_2023nov_row_count_tables.tsv b/data/validation/openalex_2023nov_row_count_tables.tsv deleted file mode 100644 index fac1967..0000000 --- a/data/validation/openalex_2023nov_row_count_tables.tsv +++ /dev/null @@ -1,75 +0,0 @@ -_author 89520174 -_concept 65073 -_funder 32437 -_institution 107246 -_publisher 10250 -_source 248643 -_work 246328942 -apc_provenance 2 -author 89656349 -author_alternative_name 122147700 -author_position 3 -citation 1891737625 -city 15007 -concept 65073 -concept_ancestor 419354 -concept_international_description 663341 -concept_international_name 1821994 -concept_related 1059247 -concept_umls_aui 15783 -concept_umls_cui 32074 -country 251 -doi_registration_agency 10 -fulltext_origin 2 -funder 32437 -funder_alternative_name 71555 -funder_publisher 2955 -institution 107261 -institution_acronym 46020 -institution_alternative_name 29635 -institution_associated 41408 -institution_funder 17855 -institution_international_name 354420 -institution_lineage 127315 -institution_name 21068202 -institution_publisher 3560 -institution_relationship_type 4 
-institution_repository 2876 -institution_type 8 -keyword 71019724 -license 20 -mesh_descriptor 30182 -mesh_qualifier 76 -oa_status 5 -publisher 10255 -publisher_alternative_name 4653 -publisher_country 8138 -raw_affiliation_string 90088716 -raw_author_name 93576522 -region 370 -source 248650 -source_alternative_title 88196 -source_apc_price 38088 -source_issn 210597 -source_society 4550 -source_type 6 -sustainable_development_goal 17 -version 3 -work 246328942 -work_abstract 126651021 -work_author 631858769 -work_author_country 328821973 -work_author_institution 380437686 -work_author_raw_affiliation_string 374637399 -work_concept 2185335170 -work_detail 246328942 -work_grant 17464912 -work_institution 166944922 -work_keyword 547180830 -work_location 311693498 -work_mesh 461393162 -work_reference 1891952103 -work_related 2228936387 -work_sustainable_development_goal 517732603 -work_title 244098671 -work_type 30 diff --git a/data/validation/openalex_2023nov_sources_json_row_count_tables.tsv b/data/validation/openalex_2023nov_sources_json_row_count_tables.tsv deleted file mode 100644 index afcd4b9..0000000 --- a/data/validation/openalex_2023nov_sources_json_row_count_tables.tsv +++ /dev/null @@ -1,9 +0,0 @@ -_file_information 248643 -source 248643 -source_alternate_title 88196 -source_apc_price 38088 -source_host_institution_lineage 2890 -source_host_organization_lineage 76270 -source_issn 210597 -source_publisher_lineage 83105 -source_society 4550 diff --git a/data/validation/openalex_2023nov_text_row_count_tables.tsv b/data/validation/openalex_2023nov_text_row_count_tables.tsv deleted file mode 100644 index e5777b3..0000000 --- a/data/validation/openalex_2023nov_text_row_count_tables.tsv +++ /dev/null @@ -1 +0,0 @@ -text_data 244322442 diff --git a/data/validation/openalex_2023nov_works_json_row_count_tables.tsv b/data/validation/openalex_2023nov_works_json_row_count_tables.tsv deleted file mode 100644 index 170bf51..0000000 --- 
a/data/validation/openalex_2023nov_works_json_row_count_tables.tsv +++ /dev/null @@ -1,19 +0,0 @@ -_file_information 246328942 -work 246328942 -work_abstract 126651022 -work_authorship 631858769 -work_authorship_country 328821973 -work_authorship_institution 380438324 -work_authorship_raw_affiliation_string 374637399 -work_best_oa_location 50925095 -work_concept 2185335170 -work_corresponding_author_id 131795736 -work_corresponding_institution_id 64191673 -work_grant 17464912 -work_keyword 547180830 -work_location 311693545 -work_mesh 461393162 -work_primary_location 244437821 -work_referenced_work 1896234121 -work_related_work 2322693788 -work_sustainable_development_goal 517732603 diff --git a/data/validation/openalex_2023nov_authors_json_data_types.tsv b/data/validation/openalex_2024aug_authors_json_data_types.tsv similarity index 53% rename from data/validation/openalex_2023nov_authors_json_data_types.tsv rename to data/validation/openalex_2024aug_authors_json_data_types.tsv index 4c04edb..ce2896a 100644 --- a/data/validation/openalex_2023nov_authors_json_data_types.tsv +++ b/data/validation/openalex_2024aug_authors_json_data_types.tsv @@ -9,12 +9,25 @@ author display_name nvarchar 100 author id char 32 author updated_date varchar 26 author works_count int 10 -author id_orcid char 37 +author id_orcid varchar 37 author id_scopus nvarchar 110 author id_twitter varchar 100 author id_wikipedia varchar 68 author last_known_institution_id varchar 32 +author_affiliation folder smallint 5 +author_affiliation record_id int 10 +author_affiliation affiliation_seq smallint 5 +author_affiliation institution_id varchar 32 +author_affiliation_year folder smallint 5 +author_affiliation_year record_id int 10 +author_affiliation_year affiliation_seq smallint 5 +author_affiliation_year year_seq smallint 5 +author_affiliation_year year smallint 5 author_display_name_alternative folder smallint 5 author_display_name_alternative record_id int 10 author_display_name_alternative 
display_name_alternative_seq smallint 5 author_display_name_alternative display_name_alternative nvarchar 100 +author_last_known_institution folder smallint 5 +author_last_known_institution record_id int 10 +author_last_known_institution last_known_institution_seq smallint 5 +author_last_known_institution id varchar 32 diff --git a/data/validation/openalex_2024aug_authors_json_row_count_tables.tsv b/data/validation/openalex_2024aug_authors_json_row_count_tables.tsv new file mode 100644 index 0000000..1ef0f5a --- /dev/null +++ b/data/validation/openalex_2024aug_authors_json_row_count_tables.tsv @@ -0,0 +1,6 @@ +_file_information 95724450 +author 95724447 +author_affiliation 88214118 +author_affiliation_year 179863422 +author_display_name_alternative 131700041 +author_last_known_institution 46153129 diff --git a/data/validation/openalex_2023nov_classification_data_types.tsv b/data/validation/openalex_2024aug_classification_data_types.tsv similarity index 98% rename from data/validation/openalex_2023nov_classification_data_types.tsv rename to data/validation/openalex_2024aug_classification_data_types.tsv index 96a1182..37f288f 100644 --- a/data/validation/openalex_2023nov_classification_data_types.tsv +++ b/data/validation/openalex_2024aug_classification_data_types.tsv @@ -7,8 +7,6 @@ cluster_labeling long_label varchar 1000 cluster_labeling keywords varchar 99999 cluster_labeling summary varchar 99999 cluster_labeling wikipedia_url varchar 200 -cluster_pub_titles cluster_no int 10 -cluster_pub_titles pub_titles nvarchar 99999 clustering work_id bigint 20 clustering macro_cluster_id smallint 5 clustering meso_cluster_id smallint 5 @@ -147,7 +145,7 @@ pub work_id bigint 20 pub n_refs_covered int 10 pub core_pub bit 1 pub pub_no bigint 20 -pub_cluster pub_no int 10 +pub_cluster pub_no bigint 20 pub_cluster micro_cluster_no int 10 pub_cluster meso_cluster_no int 10 pub_cluster macro_cluster_no int 10 diff --git 
a/data/validation/openalex_2023nov_classification_row_count_tables.tsv b/data/validation/openalex_2024aug_classification_row_count_tables.tsv similarity index 53% rename from data/validation/openalex_2023nov_classification_row_count_tables.tsv rename to data/validation/openalex_2024aug_classification_row_count_tables.tsv index 741485c..9f5260b 100644 --- a/data/validation/openalex_2023nov_classification_row_count_tables.tsv +++ b/data/validation/openalex_2024aug_classification_row_count_tables.tsv @@ -1,7 +1,6 @@ -cit_link 3430284582 +cit_link 4197701018 cluster_labeling 4521 -cluster_pub_titles 4521 -clustering 70674439 +clustering 77886862 level0_concept_main_field 19 level1_concept_main_field 284 macro_cluster 20 @@ -12,20 +11,20 @@ map_macro_clusters 20 map_meso_clusters 917 map_micro_clusters 4521 meso_cluster 917 -meso_cluster_level0_concept 1700 -meso_cluster_level1_concept 1740 -meso_cluster_main_field 1138 +meso_cluster_level0_concept 1699 +meso_cluster_level1_concept 1714 +meso_cluster_main_field 1143 meso_cluster_source 9170 meso_clusters 917 micro_cluster 4521 micro_cluster_keyword 45204 -micro_cluster_level0_concept 9659 -micro_cluster_level1_concept 8827 -micro_cluster_main_field 5945 +micro_cluster_level0_concept 9689 +micro_cluster_level1_concept 8788 +micro_cluster_main_field 5949 micro_cluster_source 45210 micro_clusters 4521 network_macro_clusters 380 -network_meso_clusters 695285 -network_micro_clusters 5659635 -pub 218665949 -pub_cluster 91424757 +network_meso_clusters 739270 +network_micro_clusters 6919565 +pub 225412634 +pub_cluster 103404374 diff --git a/data/validation/openalex_2023nov_concepts_json_data_types.tsv b/data/validation/openalex_2024aug_concepts_json_data_types.tsv similarity index 98% rename from data/validation/openalex_2023nov_concepts_json_data_types.tsv rename to data/validation/openalex_2024aug_concepts_json_data_types.tsv index 036e1c2..df106ef 100644 --- a/data/validation/openalex_2023nov_concepts_json_data_types.tsv +++ 
b/data/validation/openalex_2024aug_concepts_json_data_types.tsv @@ -1,5 +1,5 @@ _file_information folder char 8 -_file_information file varchar 11 +_file_information file varchar 10 _file_information record_id int 10 concept folder char 8 concept record_id int 10 diff --git a/data/validation/openalex_2023nov_concepts_json_row_count_tables.tsv b/data/validation/openalex_2024aug_concepts_json_row_count_tables.tsv similarity index 63% rename from data/validation/openalex_2023nov_concepts_json_row_count_tables.tsv rename to data/validation/openalex_2024aug_concepts_json_row_count_tables.tsv index 4b0bc8f..962e726 100644 --- a/data/validation/openalex_2023nov_concepts_json_row_count_tables.tsv +++ b/data/validation/openalex_2024aug_concepts_json_row_count_tables.tsv @@ -3,6 +3,6 @@ concept 65073 concept_ancestor 419354 concept_id_umls_aui 15783 concept_id_umls_cui 32074 -concept_international_description 663341 -concept_international_display_name 1821995 +concept_international_description 666333 +concept_international_display_name 1824868 concept_related_concept 1059247 diff --git a/data/validation/openalex_2023nov_core_data_types.tsv b/data/validation/openalex_2024aug_core_data_types.tsv similarity index 100% rename from data/validation/openalex_2023nov_core_data_types.tsv rename to data/validation/openalex_2024aug_core_data_types.tsv diff --git a/data/validation/openalex_2024aug_core_row_count_tables.tsv b/data/validation/openalex_2024aug_core_row_count_tables.tsv new file mode 100644 index 0000000..5b3edf0 --- /dev/null +++ b/data/validation/openalex_2024aug_core_row_count_tables.tsv @@ -0,0 +1,2 @@ +source 254553 +work 258602038 diff --git a/data/validation/openalex_2023nov_data_types.tsv b/data/validation/openalex_2024aug_data_types.tsv similarity index 73% rename from data/validation/openalex_2023nov_data_types.tsv rename to data/validation/openalex_2024aug_data_types.tsv index 5ae5954..45675de 100644 --- a/data/validation/openalex_2023nov_data_types.tsv +++ 
b/data/validation/openalex_2024aug_data_types.tsv @@ -4,6 +4,12 @@ _author record_id int 10 _concept concept_id bigint 20 _concept folder varchar 12 _concept record_id int 10 +_domain domain_id tinyint 3 +_domain folder varchar 12 +_domain record_id int 10 +_field field_id tinyint 3 +_field folder varchar 12 +_field record_id int 10 _funder funder_id bigint 20 _funder folder varchar 12 _funder record_id int 10 @@ -16,6 +22,12 @@ _publisher record_id int 10 _source source_id bigint 20 _source folder varchar 12 _source record_id int 10 +_subfield subfield_id smallint 5 +_subfield folder varchar 12 +_subfield record_id int 10 +_topic topic_id smallint 5 +_topic folder varchar 12 +_topic record_id int 10 _work work_id bigint 20 _work folder smallint 5 _work record_id int 10 @@ -23,7 +35,6 @@ apc_provenance apc_provenance_id tinyint 3 apc_provenance apc_provenance varchar 20 author author_id bigint 20 author author nvarchar 99999 -author last_known_institution_id bigint 20 author orcid char 19 author openalex_id varchar 11 author scopus_id bigint 20 @@ -33,6 +44,16 @@ author created_date datetime2 27 author_alternative_name author_id bigint 20 author_alternative_name alternative_name_seq smallint 5 author_alternative_name alternative_name nvarchar 255 +author_institution author_id bigint 20 +author_institution institution_seq smallint 5 +author_institution institution_id bigint 20 +author_institution_year author_id bigint 20 +author_institution_year institution_seq smallint 5 +author_institution_year year_seq smallint 5 +author_institution_year year smallint 5 +author_last_known_institution author_id bigint 20 +author_last_known_institution last_known_institution_seq smallint 5 +author_last_known_institution last_known_institution_id bigint 20 author_position author_position_id tinyint 3 author_position author_position varchar 6 citation citing_work_id bigint 20 @@ -59,7 +80,7 @@ concept_ancestor concept_id bigint 20 concept_ancestor ancestor_concept_seq smallint 5 
concept_ancestor ancestor_concept_id bigint 20 concept_international_description concept_id bigint 20 -concept_international_description language_code varchar 11 +concept_international_description language_code varchar 16 concept_international_description concept_international_description nvarchar 800 concept_international_name concept_id bigint 20 concept_international_name language_code varchar 11 @@ -76,8 +97,45 @@ concept_umls_cui umls_cui_seq smallint 5 concept_umls_cui umls_cui char 8 country country_iso_alpha2_code char 2 country country varchar 50 +data_source data_source_id int 10 +data_source data_source varchar 20 doi_registration_agency doi_registration_agency_id tinyint 3 doi_registration_agency doi_registration_agency varchar 20 +domain domain_id tinyint 3 +domain domain nvarchar 120 +domain description nvarchar 250 +domain openalex_id tinyint 3 +domain wikidata_id varchar 10 +domain wikipedia_url varchar 180 +domain updated_date date 23 +domain created_date datetime2 27 +domain_alternative_name domain_id tinyint 3 +domain_alternative_name alternative_name_seq smallint 5 +domain_alternative_name alternative_name nvarchar 255 +domain_field domain_id tinyint 3 +domain_field field_seq smallint 5 +domain_field field_id tinyint 3 +domain_sibling domain_id tinyint 3 +domain_sibling sibling_domain_seq smallint 5 +domain_sibling sibling_domain_id tinyint 3 +field field_id tinyint 3 +field field nvarchar 120 +field description nvarchar 250 +field openalex_id tinyint 3 +field wikidata_id varchar 10 +field wikipedia_url varchar 180 +field domain_id tinyint 3 +field updated_date date 23 +field created_date datetime2 27 +field_alternative_name field_id tinyint 3 +field_alternative_name alternative_name_seq smallint 5 +field_alternative_name alternative_name nvarchar 255 +field_sibling field_id tinyint 3 +field_sibling sibling_field_seq smallint 5 +field_sibling sibling_field_id tinyint 3 +field_subfield field_id tinyint 3 +field_subfield subfield_seq smallint 5 
+field_subfield subfield_id smallint 5 fulltext_origin fulltext_origin_id tinyint 3 fulltext_origin fulltext_origin varchar 20 funder funder_id bigint 20 @@ -107,6 +165,7 @@ institution geonames_city_id int 10 institution latitude float 309 institution longitude float 309 institution homepage_url varchar 600 +institution is_super_system bit 1 institution ror_id varchar 9 institution grid_id varchar 13 institution openalex_id varchar 11 @@ -131,13 +190,11 @@ institution_funder institution_id bigint 20 institution_funder funder_seq smallint 5 institution_funder funder_id bigint 20 institution_international_name institution_id bigint 20 -institution_international_name language_code varchar 11 +institution_international_name language_code varchar 16 institution_international_name institution_international_name nvarchar 200 institution_lineage institution_id bigint 20 institution_lineage lineage_institution_seq smallint 5 institution_lineage lineage_institution_id bigint 20 -institution_name institution_name_id int 10 -institution_name institution_name nvarchar 800 institution_publisher institution_id bigint 20 institution_publisher publisher_seq smallint 5 institution_publisher publisher_id bigint 20 @@ -149,7 +206,7 @@ institution_repository repository_source_id bigint 20 institution_type institution_type_id smallint 5 institution_type institution_type varchar 10 keyword keyword_id int 10 -keyword keyword nvarchar 1200 +keyword keyword nvarchar 200 license license_id tinyint 3 license license varchar 60 mesh_descriptor mesh_descriptor_ui varchar 10 @@ -157,7 +214,7 @@ mesh_descriptor mesh_descriptor varchar 120 mesh_qualifier mesh_qualifier_ui varchar 10 mesh_qualifier mesh_qualifier varchar 40 oa_status oa_status_id tinyint 3 -oa_status oa_status varchar 6 +oa_status oa_status varchar 10 publisher publisher_id bigint 20 publisher publisher nvarchar 200 publisher hierarchy_level smallint 5 @@ -216,9 +273,44 @@ source_society society nvarchar 500 source_society 
homepage_url varchar 250 source_type source_type_id smallint 5 source_type source_type varchar 14 +subfield subfield_id smallint 5 +subfield subfield nvarchar 120 +subfield description nvarchar 250 +subfield openalex_id smallint 5 +subfield wikidata_id varchar 10 +subfield wikipedia_url varchar 180 +subfield domain_id tinyint 3 +subfield field_id tinyint 3 +subfield updated_date date 23 +subfield created_date datetime2 27 +subfield_alternative_name subfield_id smallint 5 +subfield_alternative_name alternative_name_seq smallint 5 +subfield_alternative_name alternative_name nvarchar 255 +subfield_sibling subfield_id smallint 5 +subfield_sibling sibling_subfield_seq smallint 5 +subfield_sibling sibling_subfield_id smallint 5 +subfield_topic subfield_id smallint 5 +subfield_topic topic_seq smallint 5 +subfield_topic topic_id smallint 5 sustainable_development_goal sustainable_development_goal_id tinyint 3 sustainable_development_goal sustainable_development_goal varchar 40 sustainable_development_goal taxonomy_url varchar 30 +topic topic_id smallint 5 +topic topic nvarchar 120 +topic description nvarchar 1000 +topic openalex_id smallint 5 +topic wikipedia_url varchar 180 +topic domain_id tinyint 3 +topic field_id tinyint 3 +topic subfield_id smallint 5 +topic updated_date date 23 +topic created_date datetime2 27 +topic_keyword topic_id smallint 5 +topic_keyword keyword_seq smallint 5 +topic_keyword keyword nvarchar 100 +topic_sibling topic_id smallint 5 +topic_sibling sibling_topic_seq smallint 5 +topic_sibling sibling_topic_id smallint 5 version version_id tinyint 3 version version varchar 16 work work_id bigint 20 @@ -260,12 +352,22 @@ work updated_date date 23 work created_date datetime2 27 work_abstract work_id bigint 20 work_abstract abstract nvarchar 99999 +work_affiliation work_id bigint 20 +work_affiliation affiliation_seq smallint 5 +work_affiliation raw_affiliation_string_id int 10 +work_affiliation_institution work_id bigint 20 +work_affiliation_institution 
affiliation_seq smallint 5 +work_affiliation_institution institution_seq smallint 5 +work_affiliation_institution institution_id bigint 20 work_author work_id bigint 20 work_author author_seq smallint 5 work_author author_id bigint 20 work_author author_position_id tinyint 3 work_author is_corresponding_author bit 1 work_author raw_author_name_id int 10 +work_author_affiliation work_id bigint 20 +work_author_affiliation author_seq smallint 5 +work_author_affiliation affiliation_seq smallint 5 work_author_country work_id bigint 20 work_author_country author_seq smallint 5 work_author_country country_seq smallint 5 @@ -281,6 +383,9 @@ work_concept work_id bigint 20 work_concept concept_seq smallint 5 work_concept concept_id bigint 20 work_concept score float 309 +work_data_source work_id bigint 20 +work_data_source data_source_seq smallint 5 +work_data_source data_source_id int 10 work_detail work_id bigint 20 work_detail author_first nvarchar 1000 work_detail author_et_al nvarchar 800 @@ -292,9 +397,9 @@ work_detail pub_year smallint 5 work_detail volume nvarchar 100 work_detail issue nvarchar 80 work_detail pages nvarchar 350 -work_detail doi varchar 350 +work_detail doi varchar 330 work_detail pmid int 10 -work_detail work_type varchar 20 +work_detail work_type varchar 25 work_detail n_cits int 10 work_detail n_self_cits int 10 work_grant work_id bigint 20 @@ -304,7 +409,6 @@ work_grant funder_id bigint 20 work_institution work_id bigint 20 work_institution institution_seq smallint 5 work_institution institution_id bigint 20 -work_institution institution_name_id int 10 work_keyword work_id bigint 20 work_keyword keyword_seq smallint 5 work_keyword keyword_id int 10 @@ -338,5 +442,10 @@ work_sustainable_development_goal sustainable_development_goal_id tinyint 3 work_sustainable_development_goal score float 309 work_title work_id bigint 20 work_title title nvarchar 99999 +work_topic work_id bigint 20 +work_topic topic_seq smallint 5 +work_topic topic_id smallint 5 
+work_topic score float 309 +work_topic is_primary_topic bit 1 work_type work_type_id tinyint 3 -work_type work_type varchar 20 +work_type work_type varchar 25 diff --git a/data/validation/openalex_2024aug_domains_json_data_types.tsv b/data/validation/openalex_2024aug_domains_json_data_types.tsv new file mode 100644 index 0000000..3c66cb0 --- /dev/null +++ b/data/validation/openalex_2024aug_domains_json_data_types.tsv @@ -0,0 +1,26 @@ +_file_information folder char 7 +_file_information file char 8 +_file_information record_id smallint 5 +domain folder char 7 +domain record_id smallint 5 +domain cited_by_count int 10 +domain created_date datetime 23 +domain description nvarchar 128 +domain display_name varchar 17 +domain id char 30 +domain updated_date char 26 +domain works_count int 10 +domain id_wikidata varchar 39 +domain id_wikipedia varchar 57 +domain_display_name_alternative folder char 7 +domain_display_name_alternative record_id smallint 5 +domain_display_name_alternative display_name_alternative_seq smallint 5 +domain_display_name_alternative display_name_alternative varchar 30 +domain_field folder char 7 +domain_field record_id smallint 5 +domain_field field_seq smallint 5 +domain_field id char 30 +domain_sibling folder char 7 +domain_sibling record_id smallint 5 +domain_sibling sibling_seq smallint 5 +domain_sibling id char 30 diff --git a/data/validation/openalex_2024aug_domains_json_row_count_tables.tsv b/data/validation/openalex_2024aug_domains_json_row_count_tables.tsv new file mode 100644 index 0000000..af5ac9b --- /dev/null +++ b/data/validation/openalex_2024aug_domains_json_row_count_tables.tsv @@ -0,0 +1,5 @@ +_file_information 4 +domain 4 +domain_display_name_alternative 6 +domain_field 26 +domain_sibling 12 diff --git a/data/validation/openalex_2024aug_fields_json_data_types.tsv b/data/validation/openalex_2024aug_fields_json_data_types.tsv new file mode 100644 index 0000000..c706f22 --- /dev/null +++ 
b/data/validation/openalex_2024aug_fields_json_data_types.tsv @@ -0,0 +1,27 @@ +_file_information folder char 6 +_file_information file varchar 10 +_file_information record_id smallint 5 +field folder char 6 +field record_id smallint 5 +field cited_by_count int 10 +field created_date datetime 23 +field description varchar 197 +field display_name varchar 44 +field id char 30 +field updated_date char 26 +field works_count int 10 +field domain_id char 30 +field id_wikidata varchar 40 +field id_wikipedia varchar 53 +field_display_name_alternative folder char 6 +field_display_name_alternative record_id smallint 5 +field_display_name_alternative display_name_alternative_seq smallint 5 +field_display_name_alternative display_name_alternative varchar 41 +field_sibling folder char 6 +field_sibling record_id smallint 5 +field_sibling sibling_seq smallint 5 +field_sibling id char 30 +field_subfield folder char 6 +field_subfield record_id smallint 5 +field_subfield subfield_seq smallint 5 +field_subfield id char 35 diff --git a/data/validation/openalex_2024aug_fields_json_row_count_tables.tsv b/data/validation/openalex_2024aug_fields_json_row_count_tables.tsv new file mode 100644 index 0000000..1319514 --- /dev/null +++ b/data/validation/openalex_2024aug_fields_json_row_count_tables.tsv @@ -0,0 +1,5 @@ +_file_information 26 +field 26 +field_display_name_alternative 60 +field_sibling 160 +field_subfield 252 diff --git a/data/validation/openalex_2023nov_funders_json_data_types.tsv b/data/validation/openalex_2024aug_funders_json_data_types.tsv similarity index 100% rename from data/validation/openalex_2023nov_funders_json_data_types.tsv rename to data/validation/openalex_2024aug_funders_json_data_types.tsv diff --git a/data/validation/openalex_2024aug_funders_json_row_count_tables.tsv b/data/validation/openalex_2024aug_funders_json_row_count_tables.tsv new file mode 100644 index 0000000..71388c9 --- /dev/null +++ 
b/data/validation/openalex_2024aug_funders_json_row_count_tables.tsv @@ -0,0 +1,4 @@ +_file_information 32437 +funder 32437 +funder_alternate_title 71621 +funder_role 53265 diff --git a/data/validation/openalex_2023nov_indicators_data_types.tsv b/data/validation/openalex_2024aug_indicators_data_types.tsv similarity index 99% rename from data/validation/openalex_2023nov_indicators_data_types.tsv rename to data/validation/openalex_2024aug_indicators_data_types.tsv index 285b653..4be1edc 100644 --- a/data/validation/openalex_2023nov_indicators_data_types.tsv +++ b/data/validation/openalex_2024aug_indicators_data_types.tsv @@ -63,7 +63,7 @@ pub doc_type_no tinyint 3 pub source_id bigint 20 pub pub_year smallint 5 pub n_authors smallint 5 -pub n_institutes smallint 5 +pub n_institutions smallint 5 pub n_countries smallint 5 pub collaboration_type_no tinyint 3 pub is_industry bit 1 diff --git a/data/validation/openalex_2024aug_indicators_row_count_tables.tsv b/data/validation/openalex_2024aug_indicators_row_count_tables.tsv new file mode 100644 index 0000000..e55ae75 --- /dev/null +++ b/data/validation/openalex_2024aug_indicators_row_count_tables.tsv @@ -0,0 +1,14 @@ +citation 1155225501 +classification_system 2 +collaboration_type 3 +database 2 +database_classification_system_research_area_mean_n_cits 12150874 +database_classification_system_research_area_source_mcs_mncs_pp_uncited 1063741928 +database_classification_system_research_area_source_pp_top_prop 6382451568 +database_classification_system_research_area_top_threshold 72905244 +database_classification_system_source_pp_top_n_cits 349232808 +doc_type 3 +pub 54687111 +pub_classification_system_research_area 105970788 +pub_database 95198287 +source 217477 diff --git a/data/validation/openalex_2023nov_institutions_json_data_types.tsv b/data/validation/openalex_2024aug_institutions_json_data_types.tsv similarity index 96% rename from data/validation/openalex_2023nov_institutions_json_data_types.tsv rename to 
data/validation/openalex_2024aug_institutions_json_data_types.tsv index 8b22aba..a11869b 100644 --- a/data/validation/openalex_2023nov_institutions_json_data_types.tsv +++ b/data/validation/openalex_2024aug_institutions_json_data_types.tsv @@ -1,5 +1,5 @@ _file_information folder char 12 -_file_information file varchar 11 +_file_information file varchar 12 _file_information record_id int 10 institution folder char 12 institution record_id int 10 @@ -14,6 +14,7 @@ institution image_url varchar 691 institution type varchar 10 institution updated_date char 26 institution works_count int 10 +institution is_super_system varchar 5 institution geo_city nvarchar 58 institution geo_country nvarchar 33 institution geo_country_code char 2 @@ -38,11 +39,11 @@ institution_display_name_acronym display_name_acronym nvarchar 70 institution_display_name_alternative folder char 12 institution_display_name_alternative record_id int 10 institution_display_name_alternative display_name_alternative_seq smallint 5 -institution_display_name_alternative display_name_alternative nvarchar 235 +institution_display_name_alternative display_name_alternative nvarchar 236 institution_international_display_name folder char 12 institution_international_display_name record_id int 10 institution_international_display_name international_display_name_seq smallint 5 -institution_international_display_name international_display_name varchar 9 +institution_international_display_name international_display_name varchar 16 institution_international_display_name display_name nvarchar 191 institution_lineage folder char 12 institution_lineage record_id int 10 diff --git a/data/validation/openalex_2024aug_institutions_json_row_count_tables.tsv b/data/validation/openalex_2024aug_institutions_json_row_count_tables.tsv new file mode 100644 index 0000000..f708572 --- /dev/null +++ b/data/validation/openalex_2024aug_institutions_json_row_count_tables.tsv @@ -0,0 +1,9 @@ +_file_information 109259 +institution 109259 
+institution_associated_institution 39363 +institution_display_name_acronym 47926 +institution_display_name_alternative 65898 +institution_international_display_name 366800 +institution_lineage 136499 +institution_repository 2868 +institution_role 128004 diff --git a/data/validation/openalex_2023nov_publishers_json_data_types.tsv b/data/validation/openalex_2024aug_publishers_json_data_types.tsv similarity index 94% rename from data/validation/openalex_2023nov_publishers_json_data_types.tsv rename to data/validation/openalex_2024aug_publishers_json_data_types.tsv index 4450dc1..0391067 100644 --- a/data/validation/openalex_2023nov_publishers_json_data_types.tsv +++ b/data/validation/openalex_2024aug_publishers_json_data_types.tsv @@ -11,8 +11,8 @@ publisher homepage_url varchar 115 publisher id char 32 publisher id_ror char 25 publisher id_wikidata varchar 42 -publisher image_thumbnail_url nvarchar 312 -publisher image_url nvarchar 302 +publisher image_thumbnail_url nvarchar 283 +publisher image_url nvarchar 273 publisher sources_count smallint 5 publisher updated_date varchar 26 publisher works_count int 10 diff --git a/data/validation/openalex_2024aug_publishers_json_row_count_tables.tsv b/data/validation/openalex_2024aug_publishers_json_row_count_tables.tsv new file mode 100644 index 0000000..bce94d4 --- /dev/null +++ b/data/validation/openalex_2024aug_publishers_json_row_count_tables.tsv @@ -0,0 +1,6 @@ +_file_information 10250 +publisher 10250 +publisher_alternate_title 4643 +publisher_country_code 8166 +publisher_lineage 10568 +publisher_role 15658 diff --git a/data/validation/openalex_2024aug_row_count_tables.tsv b/data/validation/openalex_2024aug_row_count_tables.tsv new file mode 100644 index 0000000..a77c4ee --- /dev/null +++ b/data/validation/openalex_2024aug_row_count_tables.tsv @@ -0,0 +1,99 @@ +_author 95724450 +_concept 65073 +_domain 4 +_field 26 +_funder 32437 +_institution 109259 +_publisher 10250 +_source 254515 +_subfield 252 +_topic 4516 +_work 
258602038 +apc_provenance 2 +author 95780788 +author_alternative_name 131700041 +author_institution 88214118 +author_institution_year 179863422 +author_last_known_institution 88214118 +author_position 3 +citation 2455846270 +city 15000 +concept 65073 +concept_ancestor 419354 +concept_international_description 666333 +concept_international_name 1824867 +concept_related 1059247 +concept_umls_aui 15783 +concept_umls_cui 32074 +country 251 +data_source 5 +doi_registration_agency 10 +domain 4 +domain_alternative_name 6 +domain_field 26 +domain_sibling 12 +field 26 +field_alternative_name 60 +field_sibling 160 +field_subfield 252 +fulltext_origin 2 +funder 32437 +funder_alternative_name 71621 +funder_publisher 19996 +institution 109290 +institution_acronym 47926 +institution_alternative_name 65898 +institution_associated 39360 +institution_funder 17848 +institution_international_name 366800 +institution_lineage 136496 +institution_publisher 3509 +institution_relationship_type 4 +institution_repository 2868 +institution_type 8 +keyword 72716 +license 24 +mesh_descriptor 30483 +mesh_qualifier 76 +oa_status 6 +publisher 10253 +publisher_alternative_name 4643 +publisher_country 8166 +raw_affiliation_string 129561456 +raw_author_name 104337850 +region 374 +source 254553 +source_alternative_title 85624 +source_apc_price 37547 +source_issn 215158 +source_society 4483 +source_type 7 +subfield 252 +subfield_alternative_name 393 +subfield_sibling 3786 +subfield_topic 4516 +sustainable_development_goal 17 +topic 4516 +topic_keyword 45154 +topic_sibling 243006 +version 3 +work 258602038 +work_abstract 143487283 +work_affiliation 230047121 +work_affiliation_institution 217331431 +work_author 683767734 +work_author_affiliation 466518905 +work_author_country 382426483 +work_concept 2288909011 +work_data_source 204131238 +work_detail 258602038 +work_grant 19608139 +work_keyword 658094518 +work_location 336439216 +work_mesh 480506867 +work_reference 2456591606 +work_related 2298645757 
+work_sustainable_development_goal 124764828 +work_title 256377801 +work_topic 520217823 +work_type 36 diff --git a/data/validation/openalex_2023nov_sources_json_data_types.tsv b/data/validation/openalex_2024aug_sources_json_data_types.tsv similarity index 80% rename from data/validation/openalex_2023nov_sources_json_data_types.tsv rename to data/validation/openalex_2024aug_sources_json_data_types.tsv index acf3205..ec7b3d3 100644 --- a/data/validation/openalex_2023nov_sources_json_data_types.tsv +++ b/data/validation/openalex_2024aug_sources_json_data_types.tsv @@ -1,10 +1,10 @@ _file_information folder char 7 -_file_information file varchar 12 +_file_information file varchar 11 _file_information record_id int 10 source folder char 7 source record_id int 10 source abbreviated_title nvarchar 89 -source apc_usd int 10 +source apc_usd smallint 5 source cited_by_count int 10 source country_code char 2 source created_date datetime 23 @@ -14,7 +14,7 @@ source host_organization varchar 32 source host_organization_name nvarchar 155 source id varchar 32 source id_fatcat char 56 -source id_issn_l nvarchar 13 +source id_issn_l char 9 source id_mag bigint 20 source id_wikidata varchar 42 source is_in_doaj varchar 5 @@ -22,6 +22,7 @@ source is_oa varchar 5 source type varchar 14 source updated_date varchar 26 source works_count int 10 +source is_core varchar 5 source_alternate_title folder char 7 source_alternate_title record_id int 10 source_alternate_title alternate_title_seq smallint 5 @@ -31,10 +32,6 @@ source_apc_price record_id int 10 source_apc_price apc_price_seq smallint 5 source_apc_price currency char 3 source_apc_price price int 10 -source_host_institution_lineage folder char 7 -source_host_institution_lineage record_id int 10 -source_host_institution_lineage host_institution_lineage_seq smallint 5 -source_host_institution_lineage host_institution_lineage varchar 32 source_host_organization_lineage folder char 7 source_host_organization_lineage record_id int 10 
source_host_organization_lineage host_organization_lineage_seq smallint 5 @@ -42,9 +39,9 @@ source_host_organization_lineage host_organization_lineage varchar 32 source_issn folder char 7 source_issn record_id int 10 source_issn issn_seq smallint 5 -source_issn issn nvarchar 10 +source_issn issn char 9 source_publisher_lineage folder char 7 -source_publisher_lineage record_id int 10 +source_publisher_lineage record_id smallint 5 source_publisher_lineage publisher_lineage_seq smallint 5 source_publisher_lineage publisher_lineage char 32 source_society folder char 7 diff --git a/data/validation/openalex_2024aug_sources_json_row_count_tables.tsv b/data/validation/openalex_2024aug_sources_json_row_count_tables.tsv new file mode 100644 index 0000000..9865a61 --- /dev/null +++ b/data/validation/openalex_2024aug_sources_json_row_count_tables.tsv @@ -0,0 +1,8 @@ +_file_information 254515 +source 254515 +source_alternate_title 85624 +source_apc_price 37547 +source_host_organization_lineage 82303 +source_issn 215158 +source_publisher_lineage 1 +source_society 4483 diff --git a/data/validation/openalex_2024aug_subfields_json_data_types.tsv b/data/validation/openalex_2024aug_subfields_json_data_types.tsv new file mode 100644 index 0000000..4f5a2c4 --- /dev/null +++ b/data/validation/openalex_2024aug_subfields_json_data_types.tsv @@ -0,0 +1,28 @@ +_file_information folder char 9 +_file_information file varchar 10 +_file_information record_id smallint 5 +subfield folder char 9 +subfield record_id smallint 5 +subfield cited_by_count int 10 +subfield created_date datetime 23 +subfield description nvarchar 217 +subfield display_name varchar 53 +subfield id char 35 +subfield updated_date char 26 +subfield works_count int 10 +subfield domain_id char 30 +subfield field_id char 30 +subfield id_wikidata varchar 41 +subfield id_wikipedia varchar 73 +subfield_display_name_alternative folder char 9 +subfield_display_name_alternative record_id smallint 5 +subfield_display_name_alternative 
display_name_alternative_seq smallint 5 +subfield_display_name_alternative display_name_alternative nvarchar 65 +subfield_sibling folder char 9 +subfield_sibling record_id smallint 5 +subfield_sibling sibling_seq smallint 5 +subfield_sibling id char 35 +subfield_topic folder char 9 +subfield_topic record_id smallint 5 +subfield_topic topic_seq smallint 5 +subfield_topic id char 27 diff --git a/data/validation/openalex_2024aug_subfields_json_row_count_tables.tsv b/data/validation/openalex_2024aug_subfields_json_row_count_tables.tsv new file mode 100644 index 0000000..92a0420 --- /dev/null +++ b/data/validation/openalex_2024aug_subfields_json_row_count_tables.tsv @@ -0,0 +1,5 @@ +_file_information 252 +subfield 252 +subfield_display_name_alternative 393 +subfield_sibling 3786 +subfield_topic 4516 diff --git a/data/validation/openalex_2023nov_text_data_types.tsv b/data/validation/openalex_2024aug_text_data_types.tsv similarity index 100% rename from data/validation/openalex_2023nov_text_data_types.tsv rename to data/validation/openalex_2024aug_text_data_types.tsv diff --git a/data/validation/openalex_2024aug_text_row_count_tables.tsv b/data/validation/openalex_2024aug_text_row_count_tables.tsv new file mode 100644 index 0000000..afa1ccc --- /dev/null +++ b/data/validation/openalex_2024aug_text_row_count_tables.tsv @@ -0,0 +1 @@ +text_data 256854156 diff --git a/data/validation/openalex_2024aug_topics_json_data_types.tsv b/data/validation/openalex_2024aug_topics_json_data_types.tsv new file mode 100644 index 0000000..6651b23 --- /dev/null +++ b/data/validation/openalex_2024aug_topics_json_data_types.tsv @@ -0,0 +1,24 @@ +_file_information folder char 6 +_file_information file varchar 11 +_file_information record_id smallint 5 +topic folder char 6 +topic record_id smallint 5 +topic cited_by_count int 10 +topic created_date datetime 23 +topic description nvarchar 828 +topic display_name nvarchar 102 +topic id char 27 +topic updated_date char 26 +topic works_count int 10 
+topic domain_id char 30 +topic field_id char 30 +topic id_wikipedia varchar 120 +topic subfield_id char 35 +topic_keyword folder char 6 +topic_keyword record_id smallint 5 +topic_keyword keyword_seq smallint 5 +topic_keyword keyword nvarchar 76 +topic_sibling folder char 6 +topic_sibling record_id smallint 5 +topic_sibling sibling_seq smallint 5 +topic_sibling id char 27 diff --git a/data/validation/openalex_2024aug_topics_json_row_count_tables.tsv b/data/validation/openalex_2024aug_topics_json_row_count_tables.tsv new file mode 100644 index 0000000..05acbd9 --- /dev/null +++ b/data/validation/openalex_2024aug_topics_json_row_count_tables.tsv @@ -0,0 +1,4 @@ +_file_information 4516 +topic 4516 +topic_keyword 45154 +topic_sibling 243006 diff --git a/data/validation/openalex_2023nov_works_json_data_types.tsv b/data/validation/openalex_2024aug_works_json_data_types.tsv similarity index 72% rename from data/validation/openalex_2023nov_works_json_data_types.tsv rename to data/validation/openalex_2024aug_works_json_data_types.tsv index cb4f75f..15e431c 100644 --- a/data/validation/openalex_2023nov_works_json_data_types.tsv +++ b/data/validation/openalex_2024aug_works_json_data_types.tsv @@ -19,15 +19,16 @@ work language varchar 5 work locations_count smallint 5 work publication_date char 10 work publication_year smallint 5 -work referenced_works_count smallint 5 +work referenced_works_count int 10 work title nvarchar 502 -work type varchar 15 +work type varchar 23 work type_crossref varchar 19 work updated_date varchar 26 +work topics_count smallint 5 work apc_list_currency char 3 work apc_list_provenance char 4 work apc_list_value int 10 -work apc_list_value_usd int 10 +work apc_list_value_usd smallint 5 work apc_paid_currency char 3 work apc_paid_provenance varchar 7 work apc_paid_value int 10 @@ -36,14 +37,14 @@ work biblio_first_page nvarchar 173 work biblio_issue nvarchar 72 work biblio_last_page nvarchar 282 work biblio_volume nvarchar 50 -work id_arxiv_id 
nvarchar 62 -work id_doi nvarchar 341 +work id_arxiv_id nvarchar 80 +work id_doi nvarchar 321 work id_mag bigint 20 work id_pmcid varchar 49 work id_pmid varchar 42 work open_access_any_repository_has_fulltext varchar 5 work open_access_is_oa varchar 5 -work open_access_oa_status varchar 6 +work open_access_oa_status varchar 7 work open_access_oa_url nvarchar 3521 work_abstract folder smallint 5 work_abstract record_id int 10 @@ -54,9 +55,18 @@ work_authorship authorship_seq smallint 5 work_authorship author_position varchar 6 work_authorship is_corresponding varchar 5 work_authorship raw_author_name nvarchar 99999 -work_authorship raw_affiliation_string nvarchar 99999 -work_authorship author_display_name nvarchar 100 work_authorship author_id char 32 +work_authorship_affiliation folder smallint 5 +work_authorship_affiliation record_id int 10 +work_authorship_affiliation authorship_seq smallint 5 +work_authorship_affiliation affiliation_seq smallint 5 +work_authorship_affiliation raw_affiliation_string nvarchar 99999 +work_authorship_affiliation_institution_id folder smallint 5 +work_authorship_affiliation_institution_id record_id int 10 +work_authorship_affiliation_institution_id authorship_seq smallint 5 +work_authorship_affiliation_institution_id affiliation_seq smallint 5 +work_authorship_affiliation_institution_id institution_id_seq smallint 5 +work_authorship_affiliation_institution_id institution_id varchar 32 work_authorship_country folder smallint 5 work_authorship_country record_id int 10 work_authorship_country authorship_seq smallint 5 @@ -66,7 +76,6 @@ work_authorship_institution folder smallint 5 work_authorship_institution record_id int 10 work_authorship_institution authorship_seq smallint 5 work_authorship_institution institution_seq smallint 5 -work_authorship_institution display_name nvarchar 191 work_authorship_institution id varchar 32 work_authorship_raw_affiliation_string folder smallint 5 work_authorship_raw_affiliation_string record_id int 
10 @@ -78,7 +87,8 @@ work_best_oa_location record_id int 10 work_best_oa_location doi nvarchar 311 work_best_oa_location is_oa char 4 work_best_oa_location landing_page_url nvarchar 765 -work_best_oa_location license varchar 37 +work_best_oa_location license_id varchar 51 +work_best_oa_location license varchar 42 work_best_oa_location pdf_url nvarchar 3521 work_best_oa_location version varchar 16 work_best_oa_location is_accepted varchar 5 @@ -98,24 +108,38 @@ work_corresponding_institution_id folder smallint 5 work_corresponding_institution_id record_id int 10 work_corresponding_institution_id corresponding_institution_ids_seq smallint 5 work_corresponding_institution_id corresponding_institution_ids varchar 32 +work_domain folder smallint 5 +work_domain record_id smallint 5 +work_domain domain_seq smallint 5 +work_domain id smallint 5 +work_field folder smallint 5 +work_field record_id smallint 5 +work_field field_seq smallint 5 +work_field id smallint 5 work_grant folder smallint 5 work_grant record_id int 10 work_grant grant_seq smallint 5 work_grant award_id nvarchar 962 work_grant funder char 32 work_grant funder_display_name nvarchar 176 +work_indexed_in folder smallint 5 +work_indexed_in record_id int 10 +work_indexed_in indexed_in_seq smallint 5 +work_indexed_in indexed_in varchar 8 work_keyword folder smallint 5 work_keyword record_id int 10 work_keyword keyword_seq smallint 5 -work_keyword keyword nvarchar 1168 work_keyword score float 309 +work_keyword display_name nvarchar 76 +work_keyword id nvarchar 106 work_location folder smallint 5 work_location record_id int 10 work_location location_seq smallint 5 -work_location doi nvarchar 341 +work_location doi nvarchar 321 work_location is_oa varchar 5 work_location landing_page_url nvarchar 1935 -work_location license varchar 37 +work_location license_id varchar 51 +work_location license varchar 42 work_location pdf_url nvarchar 3521 work_location version varchar 16 work_location is_accepted varchar 5 @@ 
-132,27 +156,41 @@ work_mesh qualifier_name varchar 29 work_mesh qualifier_ui varchar 10 work_primary_location folder smallint 5 work_primary_location record_id int 10 -work_primary_location doi nvarchar 341 +work_primary_location doi nvarchar 321 work_primary_location is_oa varchar 5 work_primary_location landing_page_url nvarchar 1935 -work_primary_location license varchar 37 +work_primary_location license_id varchar 51 +work_primary_location license varchar 42 work_primary_location pdf_url nvarchar 3521 work_primary_location version varchar 16 work_primary_location is_accepted varchar 5 work_primary_location is_published varchar 5 work_primary_location source_id varchar 32 work_primary_location source_display_name nvarchar 500 +work_primary_topic folder smallint 5 +work_primary_topic record_id int 10 +work_primary_topic id char 27 +work_primary_topic score float 309 work_referenced_work folder smallint 5 work_referenced_work record_id int 10 -work_referenced_work referenced_work_seq smallint 5 +work_referenced_work referenced_work_seq int 10 work_referenced_work referenced_work varchar 32 work_related_work folder smallint 5 work_related_work record_id int 10 work_related_work related_work_seq smallint 5 work_related_work related_work varchar 32 +work_subfield folder smallint 5 +work_subfield record_id smallint 5 +work_subfield subfield_seq smallint 5 +work_subfield id smallint 5 work_sustainable_development_goal folder smallint 5 work_sustainable_development_goal record_id int 10 work_sustainable_development_goal sustainable_development_goal_seq smallint 5 work_sustainable_development_goal display_name varchar 39 work_sustainable_development_goal id varchar 30 work_sustainable_development_goal score float 309 +work_topic folder smallint 5 +work_topic record_id int 10 +work_topic topic_seq smallint 5 +work_topic id char 27 +work_topic score float 309 diff --git a/data/validation/openalex_2024aug_works_json_row_count_tables.tsv 
b/data/validation/openalex_2024aug_works_json_row_count_tables.tsv new file mode 100644 index 0000000..0da5ae4 --- /dev/null +++ b/data/validation/openalex_2024aug_works_json_row_count_tables.tsv @@ -0,0 +1,27 @@ +_file_information 258602038 +work 258602038 +work_abstract 143492730 +work_authorship 683767734 +work_authorship_affiliation 466602679 +work_authorship_affiliation_institution_id 451887645 +work_authorship_country 382426483 +work_authorship_institution 435324877 +work_authorship_raw_affiliation_string 466611092 +work_best_oa_location 59948268 +work_concept 2288909011 +work_corresponding_author_id 134684128 +work_corresponding_institution_id 64796678 +work_domain 15 +work_field 21 +work_grant 19608139 +work_indexed_in 204131238 +work_keyword 658101903 +work_location 336439261 +work_mesh 480506867 +work_primary_location 256789592 +work_primary_topic 216040467 +work_referenced_work 2473888183 +work_related_work 2402519693 +work_subfield 26 +work_sustainable_development_goal 124764828 +work_topic 520217823 diff --git a/doc/database_diagram.vsdm b/doc/database_diagram.vsdm index 7df90a7..2557528 100644 Binary files a/doc/database_diagram.vsdm and b/doc/database_diagram.vsdm differ diff --git a/doc/documentation.tsv b/doc/documentation.tsv index 2eb3626..91a928d 100644 --- a/doc/documentation.tsv +++ b/doc/documentation.tsv @@ -6,7 +6,6 @@ apc_provenance apc_provenance column APC provenance data source. author table Each row in this table represents an author. For more information, see https://docs.openalex.org/api-entities/authors. author author_id column OpenAlex author identifier (prefix with 'https://openalex.com/A' to get the full identifier). author author column Full name of the author. -author last_known_institution_id column Last known institution affiliation of the author. See [institution] table. author orcid column ORCID author identifier. See ORCID database. 
author openalex_id column OpenAlex identifier (prefix with 'https://openalex.com/' to get the full identifier). author scopus_id column Scopus author identifier. @@ -17,6 +16,19 @@ author_alternative_name table Each row in this table represents an alternative author_alternative_name author_id column OpenAlex author identifier. See [author] table. author_alternative_name alternative_name_seq column Alternative name sequence number. author_alternative_name alternative_name column An alternative to the name of the author. +author_institution table This table lists the institution affiliations provided by an author in their works. +author_institution author_id column OpenAlex author identifier. See [author] table. +author_institution institution_seq column Institution sequence number. +author_institution institution_id column OpenAlex institution identifier of the author's institution. See [institution] table. +author_institution_year table This table lists the years in which an author has an institution affiliation. +author_institution_year author_id column OpenAlex author identifier. See [author] table. +author_institution_year institution_seq column Institution sequence number. +author_institution_year year_seq column Year sequence number. +author_institution_year year column Year of the author's institution affiliation. +author_last_known_institution table This table lists the institution affiliations provided by an author in their most recent work. +author_last_known_institution author_id column OpenAlex author identifier. See [author] table. +author_last_known_institution last_known_institution_seq column Last known institution sequence number. +author_last_known_institution last_known_institution_id column OpenAlex institution identifier of the author's last known institution. See [institution] table. author_position table Each row in this table represents the position that an author can have in the author list (i.e., 'first', 'middle', or 'last'). 
author_position author_position_id column CWTS author position identifier. author_position author_position column Author position. @@ -71,9 +83,55 @@ concept_umls_cui umls_cui column Unified Medical Language System Concept Unique country table Each row in this table represents a country. country country_iso_alpha2_code column ISO-standard 3166-1 2-letter code of the country. country country column Country name. +data_source table Each row in this table represents a data source in which the work is indexed (e.g., arXiv, Crossref, DOAJ, or PubMed). +data_source data_source_id column CWTS data source identifier. +data_source data_source column Data source. doi_registration_agency table Each row in this table represents a DOI registration agency. doi_registration_agency doi_registration_agency_id column CWTS DOI registration agency identifier. doi_registration_agency doi_registration_agency column Registration agency. +domain table Each row in this table represents a domain. +domain domain_id column OpenAlex domain identifier. +domain domain column Domain name. +domain description column Domain description. +domain openalex_id column OpenAlex identifier (prefix with 'https://openalex.com/domains/' to get the full identifier). +domain wikidata_id column Wikidata domain identifier. For more information, see https://www.wikidata.org. +domain wikipedia_url column Wikipedia page URL of the domain. +domain updated_date column The date and time anything in the corresponding OpenAlex domain record was last changed. +domain created_date column The date the domain was introduced in the OpenAlex dataset. +domain_alternative_name table Each row in this table represents an alternative to the name of a domain. +domain_alternative_name domain_id column OpenAlex domain identifier. See [domain] table. +domain_alternative_name alternative_name_seq column Alternative name sequence number. +domain_alternative_name alternative_name column An alternative to the name of the domain. 
+domain_field table This table links domains to fields. +domain_field domain_id column OpenAlex domain identifier. See [domain] table. +domain_field field_seq column Field sequence number. +domain_field field_id column OpenAlex field identifier. See [field] table. +domain_sibling table Each row in this table represents the relation of a domain with its sibling domains. +domain_sibling domain_id column OpenAlex domain identifier. See [domain] table. +domain_sibling sibling_domain_seq column Sibling domain sequence number. +domain_sibling sibling_domain_id column OpenAlex domain identifier of the sibling domain. See [domain] table. +field table Each row in this table represents a field. +field field_id column OpenAlex field identifier. +field field column Field name. +field description column Field description. +field openalex_id column OpenAlex identifier (prefix with 'https://openalex.com/fields/' to get the full identifier). +field wikidata_id column Wikidata field identifier. For more information, see https://www.wikidata.org. +field wikipedia_url column Wikipedia page URL of the field. +field domain_id column OpenAlex domain identifier of the domain to which the field belongs. See [domain] table. +field updated_date column The date and time anything in the corresponding OpenAlex field record was last changed. +field created_date column The date the field was introduced in the OpenAlex dataset. +field_alternative_name table Each row in this table represents an alternative to the name of a field. +field_alternative_name field_id column OpenAlex field identifier. See [field] table. +field_alternative_name alternative_name_seq column Alternative name sequence number. +field_alternative_name alternative_name column An alternative to the name of the field. +field_sibling table Each row in this table represents the relation of a field with its sibling fields. +field_sibling field_id column OpenAlex field identifier. See [field] table. 
+field_sibling sibling_field_seq column Sibling field sequence number. +field_sibling sibling_field_id column OpenAlex field identifier of the sibling field. See [field] table. +field_subfield table This table links fields to subfields. +field_subfield field_id column OpenAlex field identifier. See [field] table. +field_subfield subfield_seq column Subfield sequence number. +field_subfield subfield_id column OpenAlex subfield identifier. See [subfield] table. fulltext_origin table Each row in this table represents a source used for the full text search (e.g., 'pdf', 'ngrams'). fulltext_origin fulltext_origin_id column CWTS full text origin identifer. fulltext_origin fulltext_origin column Full text origin. @@ -108,6 +166,7 @@ institution geonames_city_id column GeoNames city identifier of the region where institution latitude column Latitude of the institution. institution longitude column Longitude of the institution. institution homepage_url column Homepage URL of the institution. +institution is_super_system column Institution is a super system (0 = no, 1 = yes). This includes large university systems such as the University of California System, as well as some governments and multinational companies. institution ror_id column ROR organization identifier. See ROR database. institution grid_id column GRID organization identifier. See GRID database. institution openalex_id column OpenAlex identifier (prefix with 'https://openalex.com/' to get the full identifier). @@ -143,9 +202,6 @@ institution_lineage table Each row in this table respresents a parent instituti institution_lineage institution_id column OpenAlex institution identifier. See [institution] table. institution_lineage lineage_institution_seq column Lineage institution sequence number. institution_lineage lineage_institution_id column OpenAlex institution identifier of the lineage institution. See [institution] table. -institution_name table Each row in this table represents the name of an institution. 
-institution_name institution_name_id column CWTS institution name identifier. -institution_name institution_name column Institution name. institution_publisher table This table links institutions to publishers. institution_publisher institution_id column OpenAlex institution identifier. See [institution] table. institution_publisher publisher_seq column Publisher sequence number. @@ -245,10 +301,52 @@ source_society homepage_url column Homepage URL of the society. source_type table Each row in this table represents a type of source (i.e., 'journal', 'repository', 'conference', or 'ebook platform'). source_type source_type_id column CWTS source type identifier. source_type source_type column Source type. +subfield table Each row in this table represents a subfield. +subfield subfield_id column OpenAlex subfield identifier. +subfield subfield column Subfield name. +subfield description column Subfield description. +subfield openalex_id column OpenAlex identifier (prefix with 'https://openalex.com/subfields/' to get the full identifier). +subfield wikidata_id column Wikidata subfield identifier. For more information, see https://www.wikidata.org. +subfield wikipedia_url column Wikipedia page URL of the subfield. +subfield domain_id column OpenAlex domain identifier of the domain to which the subfield belongs. See [domain] table. +subfield field_id column OpenAlex field identifier of the field to which the subfield belongs. See [field] table. +subfield updated_date column The date and time anything in the corresponding OpenAlex subfield record was last changed. +subfield created_date column The date the subfield was introduced in the OpenAlex dataset. +subfield_alternative_name table Each row in this table represents an alternative to the name of a subfield. +subfield_alternative_name subfield_id column OpenAlex subfield identifier. See [subfield] table. +subfield_alternative_name alternative_name_seq column Alternative name sequence number. 
+subfield_alternative_name alternative_name column An alternative to the name of the subfield. +subfield_sibling table Each row in this table represents the relation of a subfield with its sibling subfields. +subfield_sibling subfield_id column OpenAlex subfield identifier. See [subfield] table. +subfield_sibling sibling_subfield_seq column Sibling subfield sequence number. +subfield_sibling sibling_subfield_id column OpenAlex subfield identifier of the sibling subfield. See [subfield] table. +subfield_topic table This table links subfields to topics. +subfield_topic subfield_id column OpenAlex subfield identifier. See [subfield] table. +subfield_topic topic_seq column Topic sequence number. +subfield_topic topic_id column OpenAlex topic identifier. See [topic] table. sustainable_development_goal table Each row in this table represents a sustainable development goal. sustainable_development_goal sustainable_development_goal_id column Sustainable development goal identifier. sustainable_development_goal sustainable_development_goal column Sustainable development goal. sustainable_development_goal taxonomy_url column URL of the sustainable development goal. +topic table Each row in this table represents a topic. +topic topic_id column OpenAlex topic identifier. +topic topic column Topic name. +topic description column Topic description. +topic openalex_id column OpenAlex identifier (prefix with 'https://openalex.com/' to get the full identifier). +topic wikipedia_url column Wikipedia page URL of the topic. +topic domain_id column OpenAlex domain identifier of the domain to which the topic belongs. See [domain] table. +topic field_id column OpenAlex field identifier of the field to which the topic belongs. See [field] table. +topic subfield_id column OpenAlex subfield identifier of the subfield to which the topic belongs. See [subfield] table. 
+topic updated_date column The date and time anything in the corresponding OpenAlex topic record was last changed. +topic created_date column The date the topic was introduced in the OpenAlex dataset. +topic_keyword table Each row in this table provides a keyword that describes a topic. +topic_keyword topic_id column OpenAlex topic identifier. See [topic] table. +topic_keyword keyword_seq column Keyword sequence number. +topic_keyword keyword column Keyword that describes the topic. +topic_sibling table Each row in this table represents the relation of a topic with its sibling topics. +topic_sibling topic_id column OpenAlex topic identifier. See [topic] table. +topic_sibling sibling_topic_seq column Sibling topic sequence number. +topic_sibling sibling_topic_id column OpenAlex topic identifier of the sibling topic. See [topic] table. version table Each row in this table represents a version that applies to the hosted work (i.e., 'publishedVersion', 'acceptedVersion', or 'submittedVersion'). Based on the DRIVER Guidelines versioning scheme, see https://wiki.surfnet.nl/display/DRIVERguidelines/DRIVER-VERSION+Mappings. version version_id column CWTS version identifier. version version column Version. @@ -293,6 +391,15 @@ work created_date column The date the work was introduced in the OpenAlex datase work_abstract table Each row in this table represents an abstract of a work. work_abstract work_id column OpenAlex work identifier. See [work] table. work_abstract abstract column Abstract of the work. +work_affiliation table This table links works to affiliations. +work_affiliation work_id column OpenAlex work identifier. See [work] table. +work_affiliation affiliation_seq column Affiliation sequence number. +work_affiliation raw_affiliation_string_id column CWTS raw affiliation string identifier. See [raw_affiliation_string] table. +work_affiliation_institution table This table links work affiliations to institutions. 
+work_affiliation_institution work_id column OpenAlex work identifier. See [work] and [work_affiliation] tables. +work_affiliation_institution affiliation_seq column Affiliation sequence number. See [work_affiliation] table. +work_affiliation_institution institution_seq column Institution sequence number. +work_affiliation_institution institution_id column OpenAlex institution identifier. See [institution] table. work_author table This table links works to authors. work_author work_id column OpenAlex work identifier. See [work] table. work_author author_seq column Author sequence number. @@ -300,25 +407,24 @@ work_author author_id column OpenAlex author identifier. See [author] table. work_author author_position_id column CWTS author position identifier. See [author_position] table. work_author is_corresponding_author column The author is the corresponding author for the work (0 = no, 1 = yes, NULL = either this work has no corresponding author, or information about the corresponding author is missing). work_author raw_author_name_id column CWTS raw author name identifier. See [raw_author_name] table. +work_author_affiliation table This table links the authors and affiliations of works. +work_author_affiliation work_id column OpenAlex work identifier. See [work], [work_author], and [work_affiliation] tables. +work_author_affiliation author_seq column Author sequence number. See [work_author] table. +work_author_affiliation affiliation_seq column Affiliation sequence number. See [work_affiliation] table. work_author_country table This table links the authors and countries of works. -work_author_country work_id column OpenAlex work identifier. See [work] table. -work_author_country author_seq column Author sequence number. +work_author_country work_id column OpenAlex work identifier. See [work] and [work_author] tables. +work_author_country author_seq column Author sequence number. See [work_author] table. 
work_author_country country_seq column Country sequence number. work_author_country country_iso_alpha2_code column ISO-standard 3166-1 2-letter code of the country of the author. See [country] table. -work_author_institution table This table links the authors and institutions of works. -work_author_institution work_id column OpenAlex work identifier. See [work] table. -work_author_institution author_seq column Author sequence number. -work_author_institution institution_seq column Institution sequence number. -work_author_raw_affiliation_string table This table links the authors and raw affiliation strings of works. -work_author_raw_affiliation_string work_id column OpenAlex work identifier. See [work] table. -work_author_raw_affiliation_string author_seq column Author sequence number. -work_author_raw_affiliation_string raw_affiliation_string_seq column Raw affiliation string sequence number. -work_author_raw_affiliation_string raw_affiliation_string_id column CWTS raw affiliation string identifier. See [raw_affiliation_string] table. work_concept table This table links works to concepts. work_concept work_id column OpenAlex work identifier. See [work] table. work_concept concept_seq column Concept sequence number. work_concept concept_id column OpenAlex concept identifier. See [concept] table. work_concept score column The strength of the connection between the work and the concept, from 0 to 1 (higher is stronger). +work_data_source table This table links works to data sources. +work_data_source work_id column OpenAlex work identifier. See [work] table. +work_data_source data_source_seq column Data source sequence number. +work_data_source data_source_id column CWTS data source identifier. See [data_source] table. work_detail table Each row in this table represents the main bibliographic details of a work. work_detail work_id column OpenAlex work identifier. See [work] table. work_detail author_first column Name of the first author of the work. 
@@ -341,11 +447,6 @@ work_grant work_id column OpenAlex work identifier. See [work] table. work_grant grant_seq column Grant sequence number. work_grant award_id column Award ID. work_grant funder_id column OpenAlex funder identifier. See [funder] table. -work_institution table This table links works to institutions. -work_institution work_id column OpenAlex work identifier. See [work] table. -work_institution institution_seq column Institution sequence number. -work_institution institution_id column OpenAlex institution identifier. See [institution] table. -work_institution institution_name_id column OpenAlex institution name identifier. See [institution_name] table. work_keyword table This table links works to keywords. work_keyword work_id column OpenAlex work identifier. See [work] table. work_keyword keyword_seq column Keyword sequence number. @@ -386,6 +487,12 @@ work_sustainable_development_goal score column The strength of the connection be work_title table Each row in this table represents the title of a work. work_title work_id column OpenAlex work identifier. See [work] table. work_title title column Title of the work. +work_topic table This table links works to topics. +work_topic work_id column OpenAlex work identifier. See [work] table. +work_topic topic_seq column Topic sequence number. +work_topic topic_id column OpenAlex topic identifier. See [topic] table. +work_topic score column The strength of the connection between the work and the topic, from 0 to 1 (higher is stronger). +work_topic is_primary_topic column Topic is the primary topic of the work (0 = no, 1 = yes). work_type table Each row in this table represents a type of work (e.g., 'article' or 'book-chapter'). work_type work_type_id column CWTS work type identifier. work_type work_type column Work type. 
diff --git a/doc/img/apc_provenance.pdf b/doc/img/apc_provenance.pdf new file mode 100644 index 0000000..57dd8f2 Binary files /dev/null and b/doc/img/apc_provenance.pdf differ diff --git a/doc/img/author.pdf b/doc/img/author.pdf index 514477f..2ee9822 100644 Binary files a/doc/img/author.pdf and b/doc/img/author.pdf differ diff --git a/doc/img/author_alternative_name.pdf b/doc/img/author_alternative_name.pdf index f0ec9ce..a18f2b4 100644 Binary files a/doc/img/author_alternative_name.pdf and b/doc/img/author_alternative_name.pdf differ diff --git a/doc/img/author_institution.pdf b/doc/img/author_institution.pdf new file mode 100644 index 0000000..eb339d2 Binary files /dev/null and b/doc/img/author_institution.pdf differ diff --git a/doc/img/author_institution_year.pdf b/doc/img/author_institution_year.pdf new file mode 100644 index 0000000..9a87c4e Binary files /dev/null and b/doc/img/author_institution_year.pdf differ diff --git a/doc/img/author_last_known_institution.pdf b/doc/img/author_last_known_institution.pdf new file mode 100644 index 0000000..f95a880 Binary files /dev/null and b/doc/img/author_last_known_institution.pdf differ diff --git a/doc/img/author_position.pdf b/doc/img/author_position.pdf index ea220c2..967d0d7 100644 Binary files a/doc/img/author_position.pdf and b/doc/img/author_position.pdf differ diff --git a/doc/img/citation.pdf b/doc/img/citation.pdf index b8b190d..5220918 100644 Binary files a/doc/img/citation.pdf and b/doc/img/citation.pdf differ diff --git a/doc/img/city.pdf b/doc/img/city.pdf index 2ee1e79..64cd202 100644 Binary files a/doc/img/city.pdf and b/doc/img/city.pdf differ diff --git a/doc/img/concept.pdf b/doc/img/concept.pdf index 040eea4..bdab353 100644 Binary files a/doc/img/concept.pdf and b/doc/img/concept.pdf differ diff --git a/doc/img/concept_ancestor.pdf b/doc/img/concept_ancestor.pdf index 9ed151d..8e55fc6 100644 Binary files a/doc/img/concept_ancestor.pdf and b/doc/img/concept_ancestor.pdf differ diff --git 
a/doc/img/concept_international_description.pdf b/doc/img/concept_international_description.pdf index 1c8aa79..9fb0c76 100644 Binary files a/doc/img/concept_international_description.pdf and b/doc/img/concept_international_description.pdf differ diff --git a/doc/img/concept_international_name.pdf b/doc/img/concept_international_name.pdf index f2fcec0..db31c02 100644 Binary files a/doc/img/concept_international_name.pdf and b/doc/img/concept_international_name.pdf differ diff --git a/doc/img/concept_related.pdf b/doc/img/concept_related.pdf index da4d5e3..6139513 100644 Binary files a/doc/img/concept_related.pdf and b/doc/img/concept_related.pdf differ diff --git a/doc/img/concept_umls_aui.pdf b/doc/img/concept_umls_aui.pdf index bb405fd..c45185f 100644 Binary files a/doc/img/concept_umls_aui.pdf and b/doc/img/concept_umls_aui.pdf differ diff --git a/doc/img/concept_umls_cui.pdf b/doc/img/concept_umls_cui.pdf index ae6911f..8259b19 100644 Binary files a/doc/img/concept_umls_cui.pdf and b/doc/img/concept_umls_cui.pdf differ diff --git a/doc/img/country.pdf b/doc/img/country.pdf index f522152..8a0c5ff 100644 Binary files a/doc/img/country.pdf and b/doc/img/country.pdf differ diff --git a/doc/img/data_source.pdf b/doc/img/data_source.pdf new file mode 100644 index 0000000..a8068d9 Binary files /dev/null and b/doc/img/data_source.pdf differ diff --git a/doc/img/doi_registration_agency.pdf b/doc/img/doi_registration_agency.pdf new file mode 100644 index 0000000..33d4de2 Binary files /dev/null and b/doc/img/doi_registration_agency.pdf differ diff --git a/doc/img/domain.pdf b/doc/img/domain.pdf new file mode 100644 index 0000000..dc69fd1 Binary files /dev/null and b/doc/img/domain.pdf differ diff --git a/doc/img/domain_alternative_name.pdf b/doc/img/domain_alternative_name.pdf new file mode 100644 index 0000000..3dcaa88 Binary files /dev/null and b/doc/img/domain_alternative_name.pdf differ diff --git a/doc/img/domain_field.pdf b/doc/img/domain_field.pdf new file mode 
100644 index 0000000..7c17e13 Binary files /dev/null and b/doc/img/domain_field.pdf differ diff --git a/doc/img/domain_sibling.pdf b/doc/img/domain_sibling.pdf new file mode 100644 index 0000000..a3004ac Binary files /dev/null and b/doc/img/domain_sibling.pdf differ diff --git a/doc/img/field.pdf b/doc/img/field.pdf new file mode 100644 index 0000000..4922815 Binary files /dev/null and b/doc/img/field.pdf differ diff --git a/doc/img/field_alternative_name.pdf b/doc/img/field_alternative_name.pdf new file mode 100644 index 0000000..425e8b9 Binary files /dev/null and b/doc/img/field_alternative_name.pdf differ diff --git a/doc/img/field_sibling.pdf b/doc/img/field_sibling.pdf new file mode 100644 index 0000000..81494a4 Binary files /dev/null and b/doc/img/field_sibling.pdf differ diff --git a/doc/img/field_subfield.pdf b/doc/img/field_subfield.pdf new file mode 100644 index 0000000..09c56d0 Binary files /dev/null and b/doc/img/field_subfield.pdf differ diff --git a/doc/img/full_database_diagram.pdf b/doc/img/full_database_diagram.pdf index be2b207..ed123d0 100644 Binary files a/doc/img/full_database_diagram.pdf and b/doc/img/full_database_diagram.pdf differ diff --git a/doc/img/fulltext_origin.pdf b/doc/img/fulltext_origin.pdf new file mode 100644 index 0000000..16fedfb Binary files /dev/null and b/doc/img/fulltext_origin.pdf differ diff --git a/doc/img/funder.pdf b/doc/img/funder.pdf new file mode 100644 index 0000000..66b67be Binary files /dev/null and b/doc/img/funder.pdf differ diff --git a/doc/img/funder_alternative_name.pdf b/doc/img/funder_alternative_name.pdf new file mode 100644 index 0000000..5b00650 Binary files /dev/null and b/doc/img/funder_alternative_name.pdf differ diff --git a/doc/img/funder_publisher.pdf b/doc/img/funder_publisher.pdf new file mode 100644 index 0000000..b7fa4b7 Binary files /dev/null and b/doc/img/funder_publisher.pdf differ diff --git a/doc/img/institution.pdf b/doc/img/institution.pdf index d153aa9..9fe64dd 100644 Binary files 
a/doc/img/institution.pdf and b/doc/img/institution.pdf differ diff --git a/doc/img/institution_acronym.pdf b/doc/img/institution_acronym.pdf index 2379707..f5905cb 100644 Binary files a/doc/img/institution_acronym.pdf and b/doc/img/institution_acronym.pdf differ diff --git a/doc/img/institution_alternative_name.pdf b/doc/img/institution_alternative_name.pdf index 3e1fcff..852ac98 100644 Binary files a/doc/img/institution_alternative_name.pdf and b/doc/img/institution_alternative_name.pdf differ diff --git a/doc/img/institution_associated.pdf b/doc/img/institution_associated.pdf index 1ca7992..52defdc 100644 Binary files a/doc/img/institution_associated.pdf and b/doc/img/institution_associated.pdf differ diff --git a/doc/img/institution_funder.pdf b/doc/img/institution_funder.pdf new file mode 100644 index 0000000..a0b2b13 Binary files /dev/null and b/doc/img/institution_funder.pdf differ diff --git a/doc/img/institution_international_name.pdf b/doc/img/institution_international_name.pdf index 0b096b2..6bca8c0 100644 Binary files a/doc/img/institution_international_name.pdf and b/doc/img/institution_international_name.pdf differ diff --git a/doc/img/institution_lineage.pdf b/doc/img/institution_lineage.pdf new file mode 100644 index 0000000..b3a739d Binary files /dev/null and b/doc/img/institution_lineage.pdf differ diff --git a/doc/img/institution_name.pdf b/doc/img/institution_name.pdf deleted file mode 100644 index 052a493..0000000 Binary files a/doc/img/institution_name.pdf and /dev/null differ diff --git a/doc/img/institution_publisher.pdf b/doc/img/institution_publisher.pdf new file mode 100644 index 0000000..55ce706 Binary files /dev/null and b/doc/img/institution_publisher.pdf differ diff --git a/doc/img/institution_relationship_type.pdf b/doc/img/institution_relationship_type.pdf index 4bc5ba5..76dfa7c 100644 Binary files a/doc/img/institution_relationship_type.pdf and b/doc/img/institution_relationship_type.pdf differ diff --git 
a/doc/img/institution_repository.pdf b/doc/img/institution_repository.pdf new file mode 100644 index 0000000..3406a68 Binary files /dev/null and b/doc/img/institution_repository.pdf differ diff --git a/doc/img/institution_type.pdf b/doc/img/institution_type.pdf index 1c97e0e..2c8e3da 100644 Binary files a/doc/img/institution_type.pdf and b/doc/img/institution_type.pdf differ diff --git a/doc/img/keyword.pdf b/doc/img/keyword.pdf new file mode 100644 index 0000000..7a5f36c Binary files /dev/null and b/doc/img/keyword.pdf differ diff --git a/doc/img/license.pdf b/doc/img/license.pdf index ff543b5..0148871 100644 Binary files a/doc/img/license.pdf and b/doc/img/license.pdf differ diff --git a/doc/img/mesh_descriptor.pdf b/doc/img/mesh_descriptor.pdf index 477ed4a..a4198eb 100644 Binary files a/doc/img/mesh_descriptor.pdf and b/doc/img/mesh_descriptor.pdf differ diff --git a/doc/img/mesh_qualifier.pdf b/doc/img/mesh_qualifier.pdf index f02024c..be887b2 100644 Binary files a/doc/img/mesh_qualifier.pdf and b/doc/img/mesh_qualifier.pdf differ diff --git a/doc/img/oa_status.pdf b/doc/img/oa_status.pdf index 609c88c..a3bc266 100644 Binary files a/doc/img/oa_status.pdf and b/doc/img/oa_status.pdf differ diff --git a/doc/img/publisher.pdf b/doc/img/publisher.pdf index dcda5e0..8171e2f 100644 Binary files a/doc/img/publisher.pdf and b/doc/img/publisher.pdf differ diff --git a/doc/img/publisher_alternative_name.pdf b/doc/img/publisher_alternative_name.pdf index 3ee9dbb..6f5da18 100644 Binary files a/doc/img/publisher_alternative_name.pdf and b/doc/img/publisher_alternative_name.pdf differ diff --git a/doc/img/publisher_country.pdf b/doc/img/publisher_country.pdf index 6e8d26f..9926712 100644 Binary files a/doc/img/publisher_country.pdf and b/doc/img/publisher_country.pdf differ diff --git a/doc/img/raw_affiliation_string.pdf b/doc/img/raw_affiliation_string.pdf index 3dfa920..6688dbc 100644 Binary files a/doc/img/raw_affiliation_string.pdf and 
b/doc/img/raw_affiliation_string.pdf differ diff --git a/doc/img/raw_author_name.pdf b/doc/img/raw_author_name.pdf new file mode 100644 index 0000000..1ef549b Binary files /dev/null and b/doc/img/raw_author_name.pdf differ diff --git a/doc/img/region.pdf b/doc/img/region.pdf index 36a911e..155c44b 100644 Binary files a/doc/img/region.pdf and b/doc/img/region.pdf differ diff --git a/doc/img/source.pdf b/doc/img/source.pdf index 8a37118..7556a59 100644 Binary files a/doc/img/source.pdf and b/doc/img/source.pdf differ diff --git a/doc/img/source_alternative_title.pdf b/doc/img/source_alternative_title.pdf index 7ddb1b5..78f3212 100644 Binary files a/doc/img/source_alternative_title.pdf and b/doc/img/source_alternative_title.pdf differ diff --git a/doc/img/source_apc_price.pdf b/doc/img/source_apc_price.pdf new file mode 100644 index 0000000..f4475b1 Binary files /dev/null and b/doc/img/source_apc_price.pdf differ diff --git a/doc/img/source_issn.pdf b/doc/img/source_issn.pdf index 4e50399..572adaf 100644 Binary files a/doc/img/source_issn.pdf and b/doc/img/source_issn.pdf differ diff --git a/doc/img/source_name.pdf b/doc/img/source_name.pdf deleted file mode 100644 index e36f39b..0000000 Binary files a/doc/img/source_name.pdf and /dev/null differ diff --git a/doc/img/source_society.pdf b/doc/img/source_society.pdf index 58f3f5a..92538ad 100644 Binary files a/doc/img/source_society.pdf and b/doc/img/source_society.pdf differ diff --git a/doc/img/source_type.pdf b/doc/img/source_type.pdf index df0a5bc..0f85f37 100644 Binary files a/doc/img/source_type.pdf and b/doc/img/source_type.pdf differ diff --git a/doc/img/subfield.pdf b/doc/img/subfield.pdf new file mode 100644 index 0000000..c84d676 Binary files /dev/null and b/doc/img/subfield.pdf differ diff --git a/doc/img/subfield_alternative_name.pdf b/doc/img/subfield_alternative_name.pdf new file mode 100644 index 0000000..7d2b77b Binary files /dev/null and b/doc/img/subfield_alternative_name.pdf differ diff --git 
a/doc/img/subfield_sibling.pdf b/doc/img/subfield_sibling.pdf new file mode 100644 index 0000000..4d0bfa2 Binary files /dev/null and b/doc/img/subfield_sibling.pdf differ diff --git a/doc/img/subfield_topic.pdf b/doc/img/subfield_topic.pdf new file mode 100644 index 0000000..cb659c1 Binary files /dev/null and b/doc/img/subfield_topic.pdf differ diff --git a/doc/img/sustainable_development_goal.pdf b/doc/img/sustainable_development_goal.pdf new file mode 100644 index 0000000..7e2c2e9 Binary files /dev/null and b/doc/img/sustainable_development_goal.pdf differ diff --git a/doc/img/topic.pdf b/doc/img/topic.pdf new file mode 100644 index 0000000..bf5d6f3 Binary files /dev/null and b/doc/img/topic.pdf differ diff --git a/doc/img/topic_keyword.pdf b/doc/img/topic_keyword.pdf new file mode 100644 index 0000000..8af9bbf Binary files /dev/null and b/doc/img/topic_keyword.pdf differ diff --git a/doc/img/topic_sibling.pdf b/doc/img/topic_sibling.pdf new file mode 100644 index 0000000..5839bcd Binary files /dev/null and b/doc/img/topic_sibling.pdf differ diff --git a/doc/img/version.pdf b/doc/img/version.pdf index 80420e4..14ce612 100644 Binary files a/doc/img/version.pdf and b/doc/img/version.pdf differ diff --git a/doc/img/work.pdf b/doc/img/work.pdf index 2280278..82cb422 100644 Binary files a/doc/img/work.pdf and b/doc/img/work.pdf differ diff --git a/doc/img/work_abstract.pdf b/doc/img/work_abstract.pdf index 84e73e4..7b45626 100644 Binary files a/doc/img/work_abstract.pdf and b/doc/img/work_abstract.pdf differ diff --git a/doc/img/work_affiliation.pdf b/doc/img/work_affiliation.pdf new file mode 100644 index 0000000..790b575 Binary files /dev/null and b/doc/img/work_affiliation.pdf differ diff --git a/doc/img/work_affiliation_institution.pdf b/doc/img/work_affiliation_institution.pdf new file mode 100644 index 0000000..57fcda5 Binary files /dev/null and b/doc/img/work_affiliation_institution.pdf differ diff --git a/doc/img/work_author.pdf b/doc/img/work_author.pdf index 
3305652..b37cf9d 100644 Binary files a/doc/img/work_author.pdf and b/doc/img/work_author.pdf differ diff --git a/doc/img/work_author_affiliation.pdf b/doc/img/work_author_affiliation.pdf new file mode 100644 index 0000000..8bb707b Binary files /dev/null and b/doc/img/work_author_affiliation.pdf differ diff --git a/doc/img/work_author_country.pdf b/doc/img/work_author_country.pdf new file mode 100644 index 0000000..03af2bc Binary files /dev/null and b/doc/img/work_author_country.pdf differ diff --git a/doc/img/work_author_institution.pdf b/doc/img/work_author_institution.pdf deleted file mode 100644 index c000754..0000000 Binary files a/doc/img/work_author_institution.pdf and /dev/null differ diff --git a/doc/img/work_concept.pdf b/doc/img/work_concept.pdf index b3d2a66..f7d699d 100644 Binary files a/doc/img/work_concept.pdf and b/doc/img/work_concept.pdf differ diff --git a/doc/img/work_data_source.pdf b/doc/img/work_data_source.pdf new file mode 100644 index 0000000..827bf13 Binary files /dev/null and b/doc/img/work_data_source.pdf differ diff --git a/doc/img/work_detail.pdf b/doc/img/work_detail.pdf index 938ff0d..735cb4a 100644 Binary files a/doc/img/work_detail.pdf and b/doc/img/work_detail.pdf differ diff --git a/doc/img/work_grant.pdf b/doc/img/work_grant.pdf new file mode 100644 index 0000000..513334e Binary files /dev/null and b/doc/img/work_grant.pdf differ diff --git a/doc/img/work_institution.pdf b/doc/img/work_institution.pdf deleted file mode 100644 index 4c0b4d2..0000000 Binary files a/doc/img/work_institution.pdf and /dev/null differ diff --git a/doc/img/work_keyword.pdf b/doc/img/work_keyword.pdf new file mode 100644 index 0000000..547929a Binary files /dev/null and b/doc/img/work_keyword.pdf differ diff --git a/doc/img/work_location.pdf b/doc/img/work_location.pdf new file mode 100644 index 0000000..a970382 Binary files /dev/null and b/doc/img/work_location.pdf differ diff --git a/doc/img/work_mesh.pdf b/doc/img/work_mesh.pdf index 8031098..4f85563 
100644 Binary files a/doc/img/work_mesh.pdf and b/doc/img/work_mesh.pdf differ diff --git a/doc/img/work_reference.pdf b/doc/img/work_reference.pdf index 94a3f1f..359bfa0 100644 Binary files a/doc/img/work_reference.pdf and b/doc/img/work_reference.pdf differ diff --git a/doc/img/work_related.pdf b/doc/img/work_related.pdf index 12230d0..adb8bd8 100644 Binary files a/doc/img/work_related.pdf and b/doc/img/work_related.pdf differ diff --git a/doc/img/work_sustainable_development_goal.pdf b/doc/img/work_sustainable_development_goal.pdf new file mode 100644 index 0000000..9d55f54 Binary files /dev/null and b/doc/img/work_sustainable_development_goal.pdf differ diff --git a/doc/img/work_title.pdf b/doc/img/work_title.pdf index 222e3a6..3b23756 100644 Binary files a/doc/img/work_title.pdf and b/doc/img/work_title.pdf differ diff --git a/doc/img/work_topic.pdf b/doc/img/work_topic.pdf new file mode 100644 index 0000000..3549398 Binary files /dev/null and b/doc/img/work_topic.pdf differ diff --git a/doc/img/work_type.pdf b/doc/img/work_type.pdf index 2991d54..d6514fc 100644 Binary files a/doc/img/work_type.pdf and b/doc/img/work_type.pdf differ diff --git a/etl-tooling b/etl-tooling index 11e7eb7..afdc11d 160000 --- a/etl-tooling +++ b/etl-tooling @@ -1 +1 @@ -Subproject commit 11e7eb7421225cb73971f862e890736c50b4b9d7 +Subproject commit afdc11dafafb6b7ded4a72a1bd1d44b37180b77e diff --git a/src/02_extract_data.bat b/src/02_extract_data.bat index b9fc2c4..75e1717 100644 --- a/src/02_extract_data.bat +++ b/src/02_extract_data.bat @@ -35,6 +35,18 @@ call %functions%\unzip_folder.bat ^ %zip_log_folder%\concepts ^ flatten_folder_structure +call %functions%\unzip_folder.bat ^ + %download_json_files_data_folder%\data\domains "gz" ^ + %extract_json_files_data_folder%\domains ^ + %zip_log_folder%\domains ^ + flatten_folder_structure + +call %functions%\unzip_folder.bat ^ + %download_json_files_data_folder%\data\fields "gz" ^ + %extract_json_files_data_folder%\fields ^ + 
%zip_log_folder%\fields ^ + flatten_folder_structure + call %functions%\unzip_folder.bat ^ %download_json_files_data_folder%\data\funders "gz" ^ %extract_json_files_data_folder%\funders ^ @@ -59,6 +71,18 @@ call %functions%\unzip_folder.bat ^ %zip_log_folder%\sources ^ flatten_folder_structure +call %functions%\unzip_folder.bat ^ + %download_json_files_data_folder%\data\subfields "gz" ^ + %extract_json_files_data_folder%\subfields ^ + %zip_log_folder%\subfields ^ + flatten_folder_structure + +call %functions%\unzip_folder.bat ^ + %download_json_files_data_folder%\data\topics "gz" ^ + %extract_json_files_data_folder%\topics ^ + %zip_log_folder%\topics ^ + flatten_folder_structure + for /L %%i in (1,1,%number_of_processes%) do ( start /min %functions%\unzip_folder.bat ^ %process_json_files_data_folder%\authors\%%i "gz" ^ diff --git a/src/03_analyze_json_files.bat b/src/03_analyze_json_files.bat index 69fd705..364c7b8 100644 --- a/src/03_analyze_json_files.bat +++ b/src/03_analyze_json_files.bat @@ -7,10 +7,14 @@ call settings.bat :: ======================================================================================= call :analyze_json_tags %extract_json_files_data_folder%\concepts +call :analyze_json_tags %extract_json_files_data_folder%\domains +call :analyze_json_tags %extract_json_files_data_folder%\fields call :analyze_json_tags %extract_json_files_data_folder%\funders call :analyze_json_tags %extract_json_files_data_folder%\institutions call :analyze_json_tags %extract_json_files_data_folder%\publishers call :analyze_json_tags %extract_json_files_data_folder%\sources +call :analyze_json_tags %extract_json_files_data_folder%\subfields +call :analyze_json_tags %extract_json_files_data_folder%\topics call :analyze_json_tags %extract_json_files_data_folder%\authors call :analyze_json_tags %extract_json_files_data_folder%\works diff --git a/src/04_parse_json.bat b/src/04_parse_json.bat index 63c04d4..fa5b622 100644 --- a/src/04_parse_json.bat +++ 
b/src/04_parse_json.bat @@ -20,6 +20,24 @@ call %functions%\json_parse_data.bat ^ %json_parser_log_folder%\concepts ^ erase_previous +call %functions%\json_parse_data.bat ^ + %domains_json_db_name% ^ + openalexdomains ^ + %process_folder%\domains ^ + %extract_json_files_data_folder%\domains ^ + %generated_sql_scripts_data_folder%\domains ^ + %json_parser_log_folder%\domains ^ + erase_previous + +call %functions%\json_parse_data.bat ^ + %fields_json_db_name% ^ + openalexfields ^ + %process_folder%\fields ^ + %extract_json_files_data_folder%\fields ^ + %generated_sql_scripts_data_folder%\fields ^ + %json_parser_log_folder%\fields ^ + erase_previous + call %functions%\json_parse_data.bat ^ %funders_json_db_name% ^ openalexfunders ^ @@ -56,6 +74,24 @@ call %functions%\json_parse_data.bat ^ %json_parser_log_folder%\sources ^ erase_previous +call %functions%\json_parse_data.bat ^ + %subfields_json_db_name% ^ + openalexsubfields ^ + %process_folder%\subfields ^ + %extract_json_files_data_folder%\subfields ^ + %generated_sql_scripts_data_folder%\subfields ^ + %json_parser_log_folder%\subfields ^ + erase_previous + +call %functions%\json_parse_data.bat ^ + %topics_json_db_name% ^ + openalextopics ^ + %process_folder%\topics ^ + %extract_json_files_data_folder%\topics ^ + %generated_sql_scripts_data_folder%\topics ^ + %json_parser_log_folder%\topics ^ + erase_previous + for /L %%i in (1,1,%number_of_processes%) do ( start /min %functions%\json_parse_data.bat ^ %authors_json_db_name% ^ diff --git a/src/05_load_json_databases.bat b/src/05_load_json_databases.bat index 04a60c8..f2222d6 100644 --- a/src/05_load_json_databases.bat +++ b/src/05_load_json_databases.bat @@ -40,6 +40,64 @@ call %functions%\bcp_data.bat ^ call %functions%\validate_database.bat %concepts_json_db_name% call %functions%\validate_data_types.bat %concepts_json_db_name% +:: DOMAINS ------------------------------------------------------------------------------ + +set db_filegrowth=1GB +call 
%functions%\create_database.bat ^ + %domains_json_db_name% ^ + %json_sql_src_folder% ^ + %json_sql_log_folder%\domains +call %functions%\check_errors.bat +set "db_filegrowth=" + +call %functions%\run_sql_script.bat ^ + %domains_json_db_name% ^ + %process_folder%\domains\create_tables.sql ^ + %json_sql_log_folder%\domains ^ + "" +call %functions%\check_errors.bat + +call %functions%\apply_page_compression.bat ^ + %domains_json_db_name% ^ + %json_sql_log_folder%\domains + +call %functions%\bcp_data.bat ^ + %domains_json_db_name% ^ + %process_folder%\domains ^ + %bcp_log_folder%\domains + +call %functions%\validate_database.bat %domains_json_db_name% +call %functions%\validate_data_types.bat %domains_json_db_name% + +:: FIELDS ------------------------------------------------------------------------------ + +set db_filegrowth=1GB +call %functions%\create_database.bat ^ + %fields_json_db_name% ^ + %json_sql_src_folder% ^ + %json_sql_log_folder%\fields +call %functions%\check_errors.bat +set "db_filegrowth=" + +call %functions%\run_sql_script.bat ^ + %fields_json_db_name% ^ + %process_folder%\fields\create_tables.sql ^ + %json_sql_log_folder%\fields ^ + "" +call %functions%\check_errors.bat + +call %functions%\apply_page_compression.bat ^ + %fields_json_db_name% ^ + %json_sql_log_folder%\fields + +call %functions%\bcp_data.bat ^ + %fields_json_db_name% ^ + %process_folder%\fields ^ + %bcp_log_folder%\fields + +call %functions%\validate_database.bat %fields_json_db_name% +call %functions%\validate_data_types.bat %fields_json_db_name% + :: FUNDERS ------------------------------------------------------------------------------ set db_filegrowth=1GB @@ -156,6 +214,64 @@ call %functions%\bcp_data.bat ^ call %functions%\validate_database.bat %sources_json_db_name% call %functions%\validate_data_types.bat %sources_json_db_name% +:: SUBFIELDS ------------------------------------------------------------------------------ + +set db_filegrowth=1GB +call 
%functions%\create_database.bat ^ + %subfields_json_db_name% ^ + %json_sql_src_folder% ^ + %json_sql_log_folder%\subfields +call %functions%\check_errors.bat +set "db_filegrowth=" + +call %functions%\run_sql_script.bat ^ + %subfields_json_db_name% ^ + %process_folder%\subfields\create_tables.sql ^ + %json_sql_log_folder%\subfields ^ + "" +call %functions%\check_errors.bat + +call %functions%\apply_page_compression.bat ^ + %subfields_json_db_name% ^ + %json_sql_log_folder%\subfields + +call %functions%\bcp_data.bat ^ + %subfields_json_db_name% ^ + %process_folder%\subfields ^ + %bcp_log_folder%\subfields + +call %functions%\validate_database.bat %subfields_json_db_name% +call %functions%\validate_data_types.bat %subfields_json_db_name% + +:: TOPICS ------------------------------------------------------------------------------ + +set db_filegrowth=1GB +call %functions%\create_database.bat ^ + %topics_json_db_name% ^ + %json_sql_src_folder% ^ + %json_sql_log_folder%\topics +call %functions%\check_errors.bat +set "db_filegrowth=" + +call %functions%\run_sql_script.bat ^ + %topics_json_db_name% ^ + %process_folder%\topics\create_tables.sql ^ + %json_sql_log_folder%\topics ^ + "" +call %functions%\check_errors.bat + +call %functions%\apply_page_compression.bat ^ + %topics_json_db_name% ^ + %json_sql_log_folder%\topics + +call %functions%\bcp_data.bat ^ + %topics_json_db_name% ^ + %process_folder%\topics ^ + %bcp_log_folder%\topics + +call %functions%\validate_database.bat %topics_json_db_name% +call %functions%\validate_data_types.bat %topics_json_db_name% + :: AUTHORS ------------------------------------------------------------------------------ call %functions%\create_database.bat ^ diff --git a/src/06_load_relational_database.bat b/src/06_load_relational_database.bat index e6615ed..5c1a548 100644 --- a/src/06_load_relational_database.bat +++ b/src/06_load_relational_database.bat @@ -25,7 +25,7 @@ call %functions%\load_database.bat ^ %relational_db_name% ^ 
%relational_sql_src_folder% ^ %relational_sql_log_folder% ^ - "-v previous_relational_db_name=%previous_relational_db_name% authors_json_db_name=%authors_json_db_name% concepts_json_db_name=%concepts_json_db_name% institutions_json_db_name=%institutions_json_db_name% publishers_json_db_name=%publishers_json_db_name% sources_json_db_name=%sources_json_db_name% works_json_db_name=%works_json_db_name% etl_db_name=%etl_db_name%" + "-v previous_relational_db_name=%previous_relational_db_name% authors_json_db_name=%authors_json_db_name% concepts_json_db_name=%concepts_json_db_name% domains_json_db_name=%domains_json_db_name% fields_json_db_name=%fields_json_db_name% institutions_json_db_name=%institutions_json_db_name% publishers_json_db_name=%publishers_json_db_name% sources_json_db_name=%sources_json_db_name% subfields_json_db_name=%subfields_json_db_name% topics_json_db_name=%topics_json_db_name% works_json_db_name=%works_json_db_name% etl_db_name=%etl_db_name%" call %functions%\check_errors.bat call %functions%\validate_database.bat %relational_db_name% diff --git a/src/08_load_classification_database.bat b/src/08_load_classification_database.bat index be3c834..d56d6dd 100644 --- a/src/08_load_classification_database.bat +++ b/src/08_load_classification_database.bat @@ -107,6 +107,34 @@ goto:eof :: ======================================================================================= +:: ======================================================================================= +:copy_previous_classification +:: ======================================================================================= + +call %functions%\run_sql_script.bat ^ + %classification_db_name% ^ + %classification_sql_src_folder%\copy_publicationclassification.sql ^ + %classification_sql_log_folder% ^ + "-v previous_classification_db_name=%previous_classification_db_name%" + +goto:eof +:: ======================================================================================= + + +:: 
======================================================================================= +:complement_classification +:: ======================================================================================= + +call %functions%\run_sql_script.bat ^ + %classification_db_name% ^ + %classification_sql_src_folder%\complement_publicationclassification.sql ^ + %classification_sql_log_folder% ^ + "-v relational_db_name=%relational_db_name%" + +goto:eof +:: ======================================================================================= + + :: ======================================================================================= :create_labeling :: ======================================================================================= @@ -131,6 +159,20 @@ goto:eof :: ======================================================================================= +:: ======================================================================================= +:copy_previous_labeling +:: ======================================================================================= + +call %functions%\run_sql_script.bat ^ + %classification_db_name% ^ + %classification_sql_src_folder%\copy_publicationclassificationlabeling.sql ^ + %classification_sql_log_folder% ^ + "-v previous_classification_db_name=%previous_classification_db_name%" + +goto:eof +:: ======================================================================================= + + :: ======================================================================================= :table_scripts :: ======================================================================================= @@ -204,11 +246,14 @@ goto:eof echo Choose step(s) to run (option numbers, [space] separated) echo Option 0: create_database echo Option 1: create_classification -echo Option 2: create_labeling -echo Option 3: table_scripts -echo Option 4: create_vosviewer_maps -echo Option 5: load_vosviewer_maps_only -echo Option 6: validate +echo Option 2: 
copy_previous_classification +echo Option 3: complement_classification +echo Option 4: create_labeling +echo Option 5: copy_previous_labeling +echo Option 6: table_scripts +echo Option 7: create_vosviewer_maps +echo Option 8: load_vosviewer_maps_only +echo Option 9: validate set /p type_of_load="Enter option: " @@ -226,11 +271,14 @@ set "type_of_load=%type_of_load% " if not "%type_of_load:0 =%" == "%type_of_load%" ( set "run=1" && call :create_database ) if not "%type_of_load:1 =%" == "%type_of_load%" ( set "run=1" && call :create_classification ) -if not "%type_of_load:2 =%" == "%type_of_load%" ( set "run=1" && call :create_labeling ) -if not "%type_of_load:3 =%" == "%type_of_load%" ( set "run=1" && call :table_scripts ) -if not "%type_of_load:4 =%" == "%type_of_load%" ( set "run=1" && call :create_vosviewer_maps ) -if not "%type_of_load:5 =%" == "%type_of_load%" ( set "run=1" && call :load_vosviewer_maps ) -if not "%type_of_load:6 =%" == "%type_of_load%" ( set "run=1" && call :validate ) +if not "%type_of_load:2 =%" == "%type_of_load%" ( set "run=1" && call :copy_previous_classification ) +if not "%type_of_load:3 =%" == "%type_of_load%" ( set "run=1" && call :complement_classification ) +if not "%type_of_load:4 =%" == "%type_of_load%" ( set "run=1" && call :create_labeling ) +if not "%type_of_load:5 =%" == "%type_of_load%" ( set "run=1" && call :copy_previous_labeling ) +if not "%type_of_load:6 =%" == "%type_of_load%" ( set "run=1" && call :table_scripts ) +if not "%type_of_load:7 =%" == "%type_of_load%" ( set "run=1" && call :create_vosviewer_maps ) +if not "%type_of_load:8 =%" == "%type_of_load%" ( set "run=1" && call :load_vosviewer_maps ) +if not "%type_of_load:9 =%" == "%type_of_load%" ( set "run=1" && call :validate ) if "%run%" == "0" ( echo No valid input diff --git a/src/10_load_indicators_database.bat b/src/10_load_indicators_database.bat index afb8346..5ef071b 100644 --- a/src/10_load_indicators_database.bat +++ b/src/10_load_indicators_database.bat @@ 
-29,21 +29,21 @@ call %functions%\run_sql_script.bat ^ %indicators_db_name% ^ %indicators_sql_src_folder%\create_func_constants.sql ^ %indicators_sql_log_folder% ^ - "" + "-v indicators_max_pub_year=%indicators_max_pub_year%" call %functions%\check_errors.bat call %functions%\run_sql_folder.bat ^ %indicators_db_name% ^ %indicators_sql_src_folder%\table_scripts ^ %indicators_sql_log_folder% ^ - "-v relational_db_name=%relational_db_name% indicators_min_pub_year=%indicators_min_pub_year%" + "-v relational_db_name=%relational_db_name% core_db_name=%core_db_name% classification_db_name=%classification_db_name% indicators_min_pub_year=%indicators_min_pub_year%" call %functions%\check_errors.bat call %functions%\run_sql_folder.bat ^ %indicators_db_name% ^ %indicators_sql_src_folder%\stored_procedures ^ %indicators_sql_log_folder% ^ - "-v relational_db_name=%relational_db_name% indicators_min_pub_year=%indicators_min_pub_year% indicators_max_pub_year=%indicators_max_pub_year%" + "-v indicators_min_pub_year=%indicators_min_pub_year%" call %functions%\check_errors.bat call %functions%\validate_database.bat %indicators_db_name% diff --git a/src/11_shrink_databases.bat b/src/11_shrink_databases.bat index 6589ad4..d1810d2 100644 --- a/src/11_shrink_databases.bat +++ b/src/11_shrink_databases.bat @@ -29,10 +29,14 @@ echo Choose databases to shrink (names or numbers [space] separated) echo Option 0: all echo Option 1: %authors_json_db_name% echo %concepts_json_db_name% +echo %domains_json_db_name% +echo %fields_json_db_name% echo %funders_json_db_name% echo %institutions_json_db_name% echo %publishers_json_db_name% echo %sources_json_db_name% +echo %subfields_json_db_name% +echo %topics_json_db_name% echo %works_json_db_name% echo Option 2: %relational_db_name% echo Option 3: %text_db_name% @@ -87,6 +91,14 @@ call %functions%\shrink_database.bat ^ %concepts_json_db_name% ^ %json_sql_log_folder%\concepts +call %functions%\shrink_database.bat ^ + %domains_json_db_name% ^ + 
%json_sql_log_folder%\domains + +call %functions%\shrink_database.bat ^ + %fields_json_db_name% ^ + %json_sql_log_folder%\fields + call %functions%\shrink_database.bat ^ %funders_json_db_name% ^ %json_sql_log_folder%\funders @@ -103,6 +115,14 @@ call %functions%\shrink_database.bat ^ %sources_json_db_name% ^ %json_sql_log_folder%\sources +call %functions%\shrink_database.bat ^ + %subfields_json_db_name% ^ + %json_sql_log_folder%\subfields + +call %functions%\shrink_database.bat ^ + %topics_json_db_name% ^ + %json_sql_log_folder%\topics + call %functions%\shrink_database.bat ^ %works_json_db_name% ^ %json_sql_log_folder%\works diff --git a/src/13_release_databases.bat b/src/13_release_databases.bat index 3fbaea9..d84cdce 100644 --- a/src/13_release_databases.bat +++ b/src/13_release_databases.bat @@ -34,10 +34,14 @@ echo Choose databases to release (option numbers, separated) echo Option 0: all echo Option 1: %authors_json_db_name% echo %concepts_json_db_name% +echo %domains_json_db_name% +echo %fields_json_db_name% echo %funders_json_db_name% echo %institutions_json_db_name% echo %publishers_json_db_name% echo %sources_json_db_name% +echo %subfields_json_db_name% +echo %topics_json_db_name% echo %works_json_db_name% echo Option 2: %relational_db_name% echo Option 3: %text_db_name% @@ -89,6 +93,14 @@ call %functions%\set_database_file_limits.bat ^ %concepts_json_db_name% ^ %json_sql_log_folder%\concepts +call %functions%\set_database_file_limits.bat ^ + %domains_json_db_name% ^ + %json_sql_log_folder%\domains + +call %functions%\set_database_file_limits.bat ^ + %fields_json_db_name% ^ + %json_sql_log_folder%\fields + call %functions%\set_database_file_limits.bat ^ %funders_json_db_name% ^ %json_sql_log_folder%\funders @@ -105,6 +117,14 @@ call %functions%\set_database_file_limits.bat ^ %sources_json_db_name% ^ %json_sql_log_folder%\sources +call %functions%\set_database_file_limits.bat ^ + %subfields_json_db_name% ^ + %json_sql_log_folder%\subfields + +call 
%functions%\set_database_file_limits.bat ^ + %topics_json_db_name% ^ + %json_sql_log_folder%\topics + call %functions%\set_database_file_limits.bat ^ %works_json_db_name% ^ %json_sql_log_folder%\works diff --git a/src/XX_compare_validation.bat b/src/XX_compare_validation.bat index 13bd38c..7de176e 100644 --- a/src/XX_compare_validation.bat +++ b/src/XX_compare_validation.bat @@ -4,19 +4,8 @@ call settings.bat :: ======================================================================================= :: Main - -::: Prompt the user to choose one or more databases to finalize. -::: Databases are finalized in numbered order -::: if the user selects [3 1 2], the order will still be [1 2 3] -::: The choice can also be passed as an argument to the script -::: use quotes in case of multiple choices ("1 2 3") - -::: For each of the databases the following actions are performed if appropriate -::: - Developer access is revoked (revoke_developer_access.sql) -::: - CWTS group access is granted -::: - Addittional access is granted (grant_access.sql) -::: - File limits are set on the database to prevent further growth :: ======================================================================================= + setlocal set db_name_choice=%~1 @@ -45,14 +34,19 @@ goto:eof echo Choose database echo Option 1: %authors_json_db_name% echo Option 2: %concepts_json_db_name% -echo Option 3: %funders_json_db_name% -echo Option 4: %institutions_json_db_name% -echo Option 5: %publishers_json_db_name% -echo Option 6: %sources_json_db_name% -echo Option 7: %works_json_db_name% -echo Option 8: %relational_db_name% -echo Option 9: %text_db_name% -echo Option 10: %classification_db_name% +echo Option 3: %domains_json_db_name% +echo Option 4: %fields_json_db_name% +echo Option 5: %funders_json_db_name% +echo Option 6: %institutions_json_db_name% +echo Option 7: %publishers_json_db_name% +echo Option 8: %sources_json_db_name% +echo Option 9: %subfields_json_db_name% +echo Option 10: 
%topics_json_db_name% +echo Option 11: %works_json_db_name% +echo Option 12: %relational_db_name% +echo Option 13: %text_db_name% +echo Option 14: %classification_db_name% +echo Option 15: %core_db_name% set /p db_name_choice="Enter option: " goto:eof @@ -64,14 +58,19 @@ goto:eof :: ======================================================================================= if "%db_name_choice%" == "1" set db_name=%authors_json_db_name% if "%db_name_choice%" == "2" set db_name=%concepts_json_db_name% -if "%db_name_choice%" == "3" set db_name=%funders_json_db_name% -if "%db_name_choice%" == "4" set db_name=%institutions_json_db_name% -if "%db_name_choice%" == "5" set db_name=%publishers_json_db_name% -if "%db_name_choice%" == "6" set db_name=%sources_json_db_name% -if "%db_name_choice%" == "7" set db_name=%works_json_db_name% -if "%db_name_choice%" == "8" set db_name=%relational_db_name% -if "%db_name_choice%" == "9" set db_name=%text_db_name% -if "%db_name_choice%" == "10" set db_name=%classification_db_name% +if "%db_name_choice%" == "3" set db_name=%domains_json_db_name% +if "%db_name_choice%" == "4" set db_name=%fields_json_db_name% +if "%db_name_choice%" == "5" set db_name=%funders_json_db_name% +if "%db_name_choice%" == "6" set db_name=%institutions_json_db_name% +if "%db_name_choice%" == "7" set db_name=%publishers_json_db_name% +if "%db_name_choice%" == "8" set db_name=%sources_json_db_name% +if "%db_name_choice%" == "9" set db_name=%subfields_json_db_name% +if "%db_name_choice%" == "10" set db_name=%topics_json_db_name% +if "%db_name_choice%" == "11" set db_name=%works_json_db_name% +if "%db_name_choice%" == "12" set db_name=%relational_db_name% +if "%db_name_choice%" == "13" set db_name=%text_db_name% +if "%db_name_choice%" == "14" set db_name=%classification_db_name% +if "%db_name_choice%" == "15" set db_name=%core_db_name% if not defined db_name ( set db_name=%db_name_choice% ) diff --git a/src/settings.bat b/src/settings.bat index 33f7705..092ee27 100644 --- 
a/src/settings.bat +++ b/src/settings.bat @@ -4,10 +4,10 @@ :: Pipeline settings :: --------------------------------------------------------------------------------------- -set db_version=2023nov -set previous_db_version=2023aug +set db_version=2024aug +set previous_db_version=2023nov set db_owner=vuw\%USERNAME% -set database_drive_letter=I +set database_drive_letter=G set notifications=true set verbose=true @@ -33,10 +33,14 @@ set indicators_db_name=openalex_%db_version%_indicators set json_db_name=openalex_%db_version%_json set authors_json_db_name=openalex_%db_version%_authors_json set concepts_json_db_name=openalex_%db_version%_concepts_json +set domains_json_db_name=openalex_%db_version%_domains_json +set fields_json_db_name=openalex_%db_version%_fields_json set funders_json_db_name=openalex_%db_version%_funders_json set institutions_json_db_name=openalex_%db_version%_institutions_json set publishers_json_db_name=openalex_%db_version%_publishers_json set sources_json_db_name=openalex_%db_version%_sources_json +set subfields_json_db_name=openalex_%db_version%_subfields_json +set topics_json_db_name=openalex_%db_version%_topics_json set works_json_db_name=openalex_%db_version%_works_json :: Utility databases @@ -47,6 +51,8 @@ set dba_db_name=cwtsdb_dba :: Classification Settings :: --------------------------------------------------------------------------------------- +set previous_classification_db_name=openalex_%previous_db_version%_classification + set classification_min_pub_year_extended_pub_set=1980 set classification_max_pub_year_extended_pub_set=2023 set classification_min_pub_year_core_pub_set=2000 @@ -68,7 +74,7 @@ set classification_classification_table=classification.pub_cluster :: publicationclassificationlabeling set classification_n_pub_titles_per_cluster=250 set classification_pub_titles_table=classification.cluster_pub_titles -set classification_label_table=classification.cluster_labels +set 
classification_label_table=classification.cluster_labeling set classification_openai_gpt_model=gpt-3.5-turbo-1106 set classification_print_labeling=%verbose% @@ -83,7 +89,7 @@ set core_min_pub_year_core_pubs=%classification_min_pub_year_core_pub_set% :: --------------------------------------------------------------------------------------- set indicators_min_pub_year=%classification_min_pub_year_core_pub_set% -set indicators_max_pub_year=2022 +set indicators_max_pub_year=2023 :: --------------------------------------------------------------------------------------- :: Terminal Settings diff --git a/src/sql/classification/complement_publicationclassification.sql b/src/sql/classification/complement_publicationclassification.sql new file mode 100644 index 0000000..a97d54b --- /dev/null +++ b/src/sql/classification/complement_publicationclassification.sql @@ -0,0 +1,67 @@ +set nocount on + +drop table if exists #pub_without_cluster +select a.pub_no, a.work_id +into #pub_without_cluster +from [classification].pub as a +left join [classification].pub_cluster as b on a.pub_no = b.pub_no +where b.pub_no is null + +drop table if exists #pub_cluster_based_on_refs +select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no +into #pub_cluster_based_on_refs +from +( + select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no, [rank] = row_number() over (partition by pub_no order by n_refs desc, micro_cluster_no desc) + from + ( + select a.pub_no, d.micro_cluster_no, d.meso_cluster_no, d.macro_cluster_no, n_refs = count(*) + from #pub_without_cluster as a + join $(relational_db_name)..citation as b on a.work_id = b.citing_work_id + join [classification].pub as c on b.cited_work_id = c.work_id + join [classification].pub_cluster as d on c.pub_no = d.pub_no + group by a.pub_no, d.micro_cluster_no, d.meso_cluster_no, d.macro_cluster_no + ) as a +) as a +where [rank] = 1 + +drop table if exists #pub_without_cluster2 +select a.pub_no, a.work_id +into 
#pub_without_cluster2 +from #pub_without_cluster as a +left join #pub_cluster_based_on_refs as b on a.pub_no = b.pub_no +where b.pub_no is null + +drop table if exists #pub_cluster_based_on_cits +select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no +into #pub_cluster_based_on_cits +from +( + select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no, [rank] = row_number() over (partition by pub_no order by n_cits desc, micro_cluster_no desc) + from + ( + select a.pub_no, d.micro_cluster_no, d.meso_cluster_no, d.macro_cluster_no, n_cits = count(*) + from #pub_without_cluster2 as a + join $(relational_db_name)..citation as b on a.work_id = b.cited_work_id + join [classification].pub as c on b.citing_work_id = c.work_id + join [classification].pub_cluster as d on c.pub_no = d.pub_no + group by a.pub_no, d.micro_cluster_no, d.meso_cluster_no, d.macro_cluster_no + ) as a +) as a +where [rank] = 1 + +drop table if exists #pub_cluster +select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no +into #pub_cluster +from +( + select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no + from #pub_cluster_based_on_refs + union + select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no + from #pub_cluster_based_on_cits +) as a + +insert into [classification].pub_cluster with(tablock) +select pub_no, micro_cluster_no, meso_cluster_no, macro_cluster_no +from #pub_cluster diff --git a/src/sql/classification/copy_publicationclassification.sql b/src/sql/classification/copy_publicationclassification.sql new file mode 100644 index 0000000..aed9a73 --- /dev/null +++ b/src/sql/classification/copy_publicationclassification.sql @@ -0,0 +1,8 @@ +set nocount on + +drop table if exists [classification].pub_cluster +select a.pub_no, c.micro_cluster_no, c.meso_cluster_no, c.macro_cluster_no +into [classification].pub_cluster +from [classification].pub as a +join $(previous_classification_db_name).[classification].pub as b on a.work_id = b.work_id 
+join $(previous_classification_db_name).[classification].pub_cluster as c on b.pub_no = c.pub_no diff --git a/src/sql/classification/copy_publicationclassificationlabeling.sql b/src/sql/classification/copy_publicationclassificationlabeling.sql new file mode 100644 index 0000000..cd4e550 --- /dev/null +++ b/src/sql/classification/copy_publicationclassificationlabeling.sql @@ -0,0 +1,6 @@ +set nocount on + +drop table if exists [classification].cluster_labeling +select * +into [classification].cluster_labeling +from $(previous_classification_db_name).[classification].cluster_labeling diff --git a/src/sql/classification/create_input_tables_publicationclassification.sql b/src/sql/classification/create_input_tables_publicationclassification.sql index 90de559..2495ce2 100644 --- a/src/sql/classification/create_input_tables_publicationclassification.sql +++ b/src/sql/classification/create_input_tables_publicationclassification.sql @@ -8,8 +8,8 @@ declare @max_pub_year_core_pub_set int = $(classification_max_pub_year_core_pub_ drop table if exists #pub select a.work_id, pub_no = row_number() over (order by a.work_id) - 1, - core_pub = cast(case when a.pub_year between @min_pub_year_core_pub_set and @max_pub_year_core_pub_set and a.work_type_id in (26, 2) then 1 else 0 end as bit) - -- 26: article, 2: book-chapter + core_pub = cast(case when a.pub_year between @min_pub_year_core_pub_set and @max_pub_year_core_pub_set and a.work_type_id in (26, 34, 2, 32) then 1 else 0 end as bit) + -- 26: article, 34: review, 2: book-chapter, 32: preprint into #pub from $(relational_db_name)..work as a where a.pub_year between @min_pub_year_extended_pub_set and @max_pub_year_extended_pub_set diff --git a/src/sql/core/table_scripts/01_create_tables.sql b/src/sql/core/table_scripts/01_create_tables.sql index 09ab4b5..9f24f40 100644 --- a/src/sql/core/table_scripts/01_create_tables.sql +++ b/src/sql/core/table_scripts/01_create_tables.sql @@ -9,19 +9,19 @@ group by work_id create index 
idx_tmp_work_n_authors_work_id on #work_n_authors(work_id) -drop table if exists #work_n_author_raw_affiliation_strings -select work_id, n_author_raw_affiliation_strings = count(*) -into #work_n_author_raw_affiliation_strings -from $(relational_db_name)..work_author_raw_affiliation_string +drop table if exists #work_n_affiliations +select work_id, n_affiliations = count(*) +into #work_n_affiliations +from $(relational_db_name)..work_affiliation group by work_id -create index idx_tmp_work_n_author_raw_affiliation_strings_work_id on #work_n_author_raw_affiliation_strings(work_id) +create index idx_tmp_work_n_affiliations_work_id on #work_n_affiliations(work_id) drop table if exists #work_country select work_id, country_code = country_iso_alpha2_code into #work_country -from $(relational_db_name)..work_institution as a +from $(relational_db_name)..work_affiliation_institution as a join $(relational_db_name)..institution as b on a.institution_id = b.institution_id where b.country_iso_alpha2_code is not null union @@ -46,7 +46,7 @@ select a.work_id, a.pub_year, a.language_iso2_code, n_authors = isnull(c.n_authors, 0), - n_author_raw_affiliation_strings = isnull(d.n_author_raw_affiliation_strings, 0), + n_affiliations = isnull(d.n_affiliations, 0), n_countries = isnull(e.n_countries, 0), a.n_refs, a.n_cits, @@ -55,7 +55,7 @@ into #work from $(relational_db_name)..work as a left join $(relational_db_name)..[source] as b on a.source_id = b.source_id left join #work_n_authors as c on a.work_id = c.work_id -left join #work_n_author_raw_affiliation_strings as d on a.work_id = d.work_id +left join #work_n_affiliations as d on a.work_id = d.work_id left join #work_n_countries as e on a.work_id = e.work_id where a.pub_year >= ($(core_min_pub_year_core_pubs) - 5) @@ -63,7 +63,7 @@ where a.pub_year >= ($(core_min_pub_year_core_pubs) - 5) -- Start identification of core publications. 
--- Step 1: Exclude works that do not have an article work type and a journal or book series source type. +-- Step 1: Exclude works that have neither an article/review work type with a journal source type nor a book-chapter/article/review work type with a book series source type. drop table if exists #work_step1 select a.* @@ -73,12 +73,12 @@ join $(relational_db_name)..work_type as b on a.work_type_id = b.work_type_id join $(relational_db_name)..source_type as c on a.source_type_id = c.source_type_id where ( - b.work_type = 'article' + b.work_type in ('article', 'review') and c.source_type = 'journal' ) or ( - b.work_type in ('book-chapter', 'article') + b.work_type in ('book-chapter', 'article', 'review') and c.source_type = 'book series' ) @@ -107,7 +107,7 @@ drop table if exists #work_step4 select * into #work_step4 from #work_step3 -where n_author_raw_affiliation_strings > 0 +where n_affiliations > 0 -- Step 5: Exclude works that do not have any references. diff --git a/src/sql/indicators/table_scripts/01_create_tables.sql b/src/sql/indicators/table_scripts/01_create_tables.sql index 7fdc301..7289c4f 100644 --- a/src/sql/indicators/table_scripts/01_create_tables.sql +++ b/src/sql/indicators/table_scripts/01_create_tables.sql @@ -56,8 +56,8 @@ create table doc_type insert doc_type values (1, 'Non-citable item'), -(2, 'Article'), +(2, 'Article / review'), --(3, 'Letter'), -(4, 'Proceeding / chapter') +(4, 'Conference paper / book Chapter') alter table doc_type add constraint pk_doc_type primary key(doc_type_no) diff --git a/src/sql/indicators/table_scripts/02_create_tables.sql b/src/sql/indicators/table_scripts/02_create_tables.sql index ba78807..063e241 100644 --- a/src/sql/indicators/table_scripts/02_create_tables.sql +++ b/src/sql/indicators/table_scripts/02_create_tables.sql @@ -9,19 +9,19 @@ group by work_id create index idx_tmp_pub_n_authors_work_id on #pub_n_authors(work_id) -drop table if exists #pub_n_author_raw_affiliation_strings -select work_id,
n_author_raw_affiliation_strings = count(*) -into #pub_n_author_raw_affiliation_strings -from $(relational_db_name)..work_author_raw_affiliation_string +drop table if exists #pub_n_affiliations +select work_id, n_affiliations = count(*) +into #pub_n_affiliations +from $(relational_db_name)..work_affiliation group by work_id -create index idx_tmp_pub_n_author_raw_affiliation_strings_work_id on #pub_n_author_raw_affiliation_strings(work_id) +create index idx_tmp_pub_n_affiliations_work_id on #pub_n_affiliations(work_id) drop table if exists #pub_n_institutions select work_id, n_institutions = count(distinct institution_id) into #pub_n_institutions -from $(relational_db_name)..work_institution +from $(relational_db_name)..work_affiliation_institution where institution_id is not null group by work_id @@ -42,7 +42,7 @@ where country_iso_alpha2_code in ('cn', 'hk', 'mo') drop table if exists #pub_country select a.work_id, country_code = c.cleaned_country_iso_alpha2_code into #pub_country -from $(relational_db_name)..work_institution as a +from $(relational_db_name)..work_affiliation_institution as a join $(relational_db_name)..institution as b on a.institution_id = b.institution_id join #country as c on b.country_iso_alpha2_code = c.country_iso_alpha2_code union @@ -63,7 +63,7 @@ create index idx_tmp_pub_n_countries_work_id on #pub_n_countries(work_id) drop table if exists #pub_industry select distinct a.work_id into #pub_industry -from $(relational_db_name)..work_institution as a +from $(relational_db_name)..work_affiliation_institution as a join $(relational_db_name)..institution as b on a.institution_id = b.institution_id where b.institution_type_id = 2 -- Company. 
@@ -74,7 +74,7 @@ drop table if exists #pub_oa select a.work_id, is_oa, - is_gold_oa = cast(case when oa_status_id = 3 then 1 else 0 end as bit), + is_gold_oa = cast(case when oa_status_id in (3, 6) then 1 else 0 end as bit), is_hybrid_oa = cast(case when oa_status_id = 5 then 1 else 0 end as bit), is_bronze_oa = cast(case when oa_status_id = 1 then 1 else 0 end as bit), is_green_oa = cast(case when oa_status_id = 4 then 1 when b.work_id is not null then 1 else 0 end as bit) @@ -96,7 +96,7 @@ create index idx_tmp_pub_oa_work_id on #pub_oa(work_id) drop table if exists #pub_coordinates select a.work_id, latitude = b.latitude * pi() / 180, longitude = b.longitude * pi() / 180 into #pub_coordinates -from $(relational_db_name)..work_institution as a +from $(relational_db_name)..work_affiliation_institution as a join $(relational_db_name)..institution as b on a.institution_id = b.institution_id where b.latitude is not null and b.longitude is not null @@ -118,14 +118,14 @@ select a.work_id, work_no = row_number() over (order by a.work_id), doc_type_no = case - when a.work_type_id = 26 /* article */ and b.source_type_id = 3 /* journal */ then 2 -- Article. - when a.work_type_id in (2, 26) /* book-chapter, article */ and b.source_type_id = 5 /* book series */ then 2 -- Article. - when a.work_type_id in (2, 26) /* book-chapter, article */ and b.source_type_id in (1, 2) /* conference, ebook platform */ then 4 -- Proceeding / Chapter. + when a.work_type_id in (26, 34) /* article, review */ and b.source_type_id = 3 /* journal */ then 2 -- Article / review. + when a.work_type_id in (2, 26, 34) /* book-chapter, article, review */ and b.source_type_id = 5 /* book series */ then 2 -- Article / review. + when a.work_type_id in (2, 26, 34) /* book-chapter, article, review */ and b.source_type_id in (1, 2) /* conference, ebook platform */ then 4 -- Conference paper / book Chapter. 
else 1 end, a.source_id, a.pub_year, - has_required_metadata = cast(case when a.source_id is not null and isnull(d.n_authors, 0) > 0 and isnull(e.n_author_raw_affiliation_strings, 0) > 0 and a.n_refs > 0 then 1 else 0 end as bit), + has_required_metadata = cast(case when a.source_id is not null and isnull(d.n_authors, 0) > 0 and isnull(e.n_affiliations, 0) > 0 and a.n_refs > 0 then 1 else 0 end as bit), n_authors = isnull(d.n_authors, 1), n_institutions = isnull(f.n_institutions, 1), n_countries = isnull(g.n_countries, 1), @@ -141,7 +141,7 @@ into #pub from $(relational_db_name)..work as a left join $(relational_db_name)..[source] as b on a.source_id = b.source_id left join #pub_n_authors as d on a.work_id = d.work_id -left join #pub_n_author_raw_affiliation_strings as e on a.work_id = e.work_id +left join #pub_n_affiliations as e on a.work_id = e.work_id left join #pub_n_institutions as f on a.work_id = f.work_id left join #pub_n_countries as g on a.work_id = g.work_id left join #pub_industry as h on a.work_id = h.work_id diff --git a/src/sql/relational/pre_processing_scripts/03_create_help_table_domain.sql b/src/sql/relational/pre_processing_scripts/03_create_help_table_domain.sql new file mode 100644 index 0000000..e8b7748 --- /dev/null +++ b/src/sql/relational/pre_processing_scripts/03_create_help_table_domain.sql @@ -0,0 +1,23 @@ +set nocount on + +drop table if exists _domain +create table _domain +( + domain_id tinyint not null, + folder varchar(12) not null, + record_id int not null +) +go + +insert into _domain with(tablock) +select + domain_id = replace(id, 'https://openalex.org/domains/', ''), + folder, + record_id +from $(domains_json_db_name)..domain + +alter table _domain add constraint pk_tmp_domain primary key(folder, record_id) +create index idx_tmp_domain_folder on _domain(folder) +create index idx_tmp_domain_record_id on _domain(record_id) +create index idx_tmp_domain_domain_id on _domain(domain_id) +go diff --git 
a/src/sql/relational/pre_processing_scripts/04_create_help_table_field.sql b/src/sql/relational/pre_processing_scripts/04_create_help_table_field.sql new file mode 100644 index 0000000..7d6e36b --- /dev/null +++ b/src/sql/relational/pre_processing_scripts/04_create_help_table_field.sql @@ -0,0 +1,23 @@ +set nocount on + +drop table if exists _field +create table _field +( + field_id tinyint not null, + folder varchar(12) not null, + record_id int not null +) +go + +insert into _field with(tablock) +select + field_id = replace(id, 'https://openalex.org/fields/', ''), + folder, + record_id +from $(fields_json_db_name)..field + +alter table _field add constraint pk_tmp_field primary key(folder, record_id) +create index idx_tmp_field_folder on _field(folder) +create index idx_tmp_field_record_id on _field(record_id) +create index idx_tmp_field_field_id on _field(field_id) +go diff --git a/src/sql/relational/pre_processing_scripts/04_create_help_table_institution.sql b/src/sql/relational/pre_processing_scripts/05_create_help_table_institution.sql similarity index 100% rename from src/sql/relational/pre_processing_scripts/04_create_help_table_institution.sql rename to src/sql/relational/pre_processing_scripts/05_create_help_table_institution.sql diff --git a/src/sql/relational/pre_processing_scripts/03_create_help_table_funder.sql b/src/sql/relational/pre_processing_scripts/06_create_help_table_funder.sql similarity index 100% rename from src/sql/relational/pre_processing_scripts/03_create_help_table_funder.sql rename to src/sql/relational/pre_processing_scripts/06_create_help_table_funder.sql diff --git a/src/sql/relational/pre_processing_scripts/05_create_help_table_publishers.sql b/src/sql/relational/pre_processing_scripts/07_create_help_table_publishers.sql similarity index 100% rename from src/sql/relational/pre_processing_scripts/05_create_help_table_publishers.sql rename to src/sql/relational/pre_processing_scripts/07_create_help_table_publishers.sql diff --git 
a/src/sql/relational/pre_processing_scripts/06_create_help_table_sources.sql b/src/sql/relational/pre_processing_scripts/08_create_help_table_sources.sql similarity index 100% rename from src/sql/relational/pre_processing_scripts/06_create_help_table_sources.sql rename to src/sql/relational/pre_processing_scripts/08_create_help_table_sources.sql diff --git a/src/sql/relational/pre_processing_scripts/09_create_help_table_subfield.sql b/src/sql/relational/pre_processing_scripts/09_create_help_table_subfield.sql new file mode 100644 index 0000000..792c081 --- /dev/null +++ b/src/sql/relational/pre_processing_scripts/09_create_help_table_subfield.sql @@ -0,0 +1,23 @@ +set nocount on + +drop table if exists _subfield +create table _subfield +( + subfield_id smallint not null, + folder varchar(12) not null, + record_id int not null +) +go + +insert into _subfield with(tablock) +select + subfield_id = replace(id, 'https://openalex.org/subfields/', ''), + folder, + record_id +from $(subfields_json_db_name)..subfield + +alter table _subfield add constraint pk_tmp_subfield primary key(folder, record_id) +create index idx_tmp_subfield_folder on _subfield(folder) +create index idx_tmp_subfield_record_id on _subfield(record_id) +create index idx_tmp_subfield_subfield_id on _subfield(subfield_id) +go diff --git a/src/sql/relational/pre_processing_scripts/10_create_help_table_topics.sql b/src/sql/relational/pre_processing_scripts/10_create_help_table_topics.sql new file mode 100644 index 0000000..c50d19d --- /dev/null +++ b/src/sql/relational/pre_processing_scripts/10_create_help_table_topics.sql @@ -0,0 +1,23 @@ +set nocount on + +drop table if exists _topic +create table _topic +( + topic_id smallint not null, + folder varchar(12) not null, + record_id int not null +) +go + +insert into _topic with(tablock) +select + topic_id = replace(id, 'https://openalex.org/T', ''), + folder, + record_id +from $(topics_json_db_name)..topic + +alter table _topic add constraint pk_tmp_topic 
primary key(folder, record_id) +create index idx_tmp_topic_folder on _topic(folder) +create index idx_tmp_topic_record_id on _topic(record_id) +create index idx_tmp_topic_topic_id on _topic(topic_id) +go diff --git a/src/sql/relational/pre_processing_scripts/07_create_help_table_work.sql b/src/sql/relational/pre_processing_scripts/11_create_help_table_work.sql similarity index 100% rename from src/sql/relational/pre_processing_scripts/07_create_help_table_work.sql rename to src/sql/relational/pre_processing_scripts/11_create_help_table_work.sql diff --git a/src/sql/relational/remove_help_tables.sql b/src/sql/relational/remove_help_tables.sql index edd7785..e06e442 100644 --- a/src/sql/relational/remove_help_tables.sql +++ b/src/sql/relational/remove_help_tables.sql @@ -2,8 +2,12 @@ set nocount on drop table if exists _author drop table if exists _concept +drop table if exists _domain +drop table if exists _field drop table if exists _funder drop table if exists _institution drop table if exists _publisher drop table if exists _source +drop table if exists _subfield +drop table if exists _topic drop table if exists _work diff --git a/src/sql/relational/table_scripts/01_author.sql b/src/sql/relational/table_scripts/01_author.sql index 9dc8c9c..0aa524c 100644 --- a/src/sql/relational/table_scripts/01_author.sql +++ b/src/sql/relational/table_scripts/01_author.sql @@ -3,7 +3,6 @@ set nocount on drop table if exists #author select a.author_id, author = b.display_name, - last_known_institution_id = replace(b.last_known_institution_id, 'https://openalex.org/I', ''), orcid = $(etl_db_name).dbo.regex_replace(b.id_orcid, '(https://orcid.org/)|[/]|[ ]', ''), openalex_id = 'A' + cast(a.author_id as varchar(10)), scopus_id = $(etl_db_name).dbo.regex_replace(b.id_scopus, '^(.*author[Ii][Dd]=)(.*?)([0-9]+)(.*?)$', '$3'), @@ -22,7 +21,6 @@ create table author ( author_id bigint not null, author nvarchar(max) null, - last_known_institution_id bigint null, orcid char(19) null, 
openalex_id varchar(11) not null, scopus_id bigint null, @@ -35,7 +33,6 @@ go insert into author with(tablock) select author_id, author, - last_known_institution_id, case when len(orcid) = 19 then orcid else null end, openalex_id, try_cast(scopus_id as bigint), @@ -54,7 +51,6 @@ select author_id, openalex_id from author alter table author add constraint pk_author primary key(author_id) -create index idx_author_last_known_institution_id on author(last_known_institution_id) create index idx_author_orcid on author(orcid) create index idx_author_openalex_id on author(openalex_id) create index idx_author_scopus_id on author(scopus_id) diff --git a/src/sql/relational/table_scripts/02_concept.sql b/src/sql/relational/table_scripts/02_concept.sql index 7ded84b..247cb15 100644 --- a/src/sql/relational/table_scripts/02_concept.sql +++ b/src/sql/relational/table_scripts/02_concept.sql @@ -167,7 +167,7 @@ drop table if exists concept_international_description create table concept_international_description ( concept_id bigint not null, - language_code varchar(11) not null, + language_code varchar(16) not null, concept_international_description nvarchar(800) not null ) go diff --git a/src/sql/relational/table_scripts/03_domain.sql b/src/sql/relational/table_scripts/03_domain.sql new file mode 100644 index 0000000..2fe6188 --- /dev/null +++ b/src/sql/relational/table_scripts/03_domain.sql @@ -0,0 +1,78 @@ +set nocount on + +-- domain +drop table if exists domain +create table domain +( + domain_id tinyint not null, + domain nvarchar(120) not null, + [description] nvarchar(250) null, + openalex_id tinyint not null, + wikidata_id varchar(10) null, + wikipedia_url varchar(180) null, + updated_date date not null, + created_date datetime2 not null +) +go + +insert into domain with(tablock) +select a.domain_id, + b.display_name, + b.[description], + openalex_id = a.domain_id, + wikidata_id = replace(b.id_wikidata, 'https://www.wikidata.org/wiki/', ''), + wikipedia_url = b.id_wikipedia, 
+ b.updated_date, + b.created_date +from _domain as a +join $(domains_json_db_name)..domain as b on a.folder = b.folder and a.record_id = b.record_id + +alter table domain add constraint pk_domain primary key(domain_id) +create index idx_domain_openalex_id on domain(openalex_id) +create index idx_domain_wikidata_id on domain(wikidata_id) + + + +-- domain_alternative_name +drop table if exists domain_alternative_name +create table domain_alternative_name +( + domain_id tinyint not null, + alternative_name_seq smallint not null, + alternative_name nvarchar(255) not null +) +go + +insert into domain_alternative_name with(tablock) +select a.domain_id, + b.display_name_alternative_seq, + b.display_name_alternative +from _domain as a +join $(domains_json_db_name)..domain_display_name_alternative as b on a.folder = b.folder and a.record_id = b.record_id + +alter table domain_alternative_name add constraint pk_domain_alternative_name primary key(domain_id, alternative_name_seq) +alter table domain_alternative_name add constraint fk_domain_alternative_name_domain_id_domain_domain_id foreign key(domain_id) references domain(domain_id) + + + +-- domain_sibling +drop table if exists domain_sibling +create table domain_sibling +( + domain_id tinyint not null, + sibling_domain_seq smallint not null, + sibling_domain_id tinyint not null +) +go + +insert into domain_sibling with(tablock) +select a.domain_id, + b.sibling_seq, + sibling_domain_id = replace(b.id, 'https://openalex.org/domains/', '') +from _domain as a +join $(domains_json_db_name)..domain_sibling as b on a.folder = b.folder and a.record_id = b.record_id + +alter table domain_sibling add constraint pk_domain_sibling primary key(domain_id, sibling_domain_seq) +create index idx_domain_sibling_sibling_domain_id on domain_sibling(sibling_domain_id) +alter table domain_sibling add constraint fk_domain_sibling_domain_id_domain_domain_id foreign key(domain_id) references domain(domain_id) +alter table domain_sibling add 
constraint fk_domain_sibling_sibling_domain_id_domain_domain_id foreign key(sibling_domain_id) references domain(domain_id) diff --git a/src/sql/relational/table_scripts/04_field.sql b/src/sql/relational/table_scripts/04_field.sql new file mode 100644 index 0000000..9bbed57 --- /dev/null +++ b/src/sql/relational/table_scripts/04_field.sql @@ -0,0 +1,107 @@ +set nocount on + +-- field +drop table if exists field +create table field +( + field_id tinyint not null, + field nvarchar(120) not null, + [description] nvarchar(250) null, + openalex_id tinyint not null, + wikidata_id varchar(10) null, + wikipedia_url varchar(180) null, + domain_id tinyint not null, + updated_date date not null, + created_date datetime2 not null +) +go + +insert into field with(tablock) +select a.field_id, + b.display_name, + b.[description], + openalex_id = a.field_id, + wikidata_id = replace(b.id_wikidata, 'https://www.wikidata.org/wiki/', ''), + wikipedia_url = b.id_wikipedia, + domain_id = replace(b.domain_id, 'https://openalex.org/domains/', ''), + b.updated_date, + b.created_date +from _field as a +join $(fields_json_db_name)..field as b on a.folder = b.folder and a.record_id = b.record_id + +alter table field add constraint pk_field primary key(field_id) +alter table field add constraint fk_field_domain_id_domain_domain_id foreign key(domain_id) references domain(domain_id) +create index idx_field_openalex_id on field(openalex_id) +create index idx_field_wikidata_id on field(wikidata_id) +create index idx_field_domain_id on field(domain_id) + + + +-- field_alternative_name +drop table if exists field_alternative_name +create table field_alternative_name +( + field_id tinyint not null, + alternative_name_seq smallint not null, + alternative_name nvarchar(255) not null +) +go + +insert into field_alternative_name with(tablock) +select a.field_id, + b.display_name_alternative_seq, + b.display_name_alternative +from _field as a +join $(fields_json_db_name)..field_display_name_alternative 
as b on a.folder = b.folder and a.record_id = b.record_id + +alter table field_alternative_name add constraint pk_field_alternative_name primary key(field_id, alternative_name_seq) +alter table field_alternative_name add constraint fk_field_alternative_name_field_id_field_field_id foreign key(field_id) references field(field_id) + + + +-- field_sibling +drop table if exists field_sibling +create table field_sibling +( + field_id tinyint not null, + sibling_field_seq smallint not null, + sibling_field_id tinyint not null +) +go + +insert into field_sibling with(tablock) +select a.field_id, + b.sibling_seq, + sibling_field_id = replace(b.id, 'https://openalex.org/fields/', '') +from _field as a +join $(fields_json_db_name)..field_sibling as b on a.folder = b.folder and a.record_id = b.record_id + +alter table field_sibling add constraint pk_field_sibling primary key(field_id, sibling_field_seq) +create index idx_field_sibling_sibling_field_id on field_sibling(sibling_field_id) +alter table field_sibling add constraint fk_field_sibling_field_id_field_field_id foreign key(field_id) references field(field_id) +alter table field_sibling add constraint fk_field_sibling_sibling_field_id_field_field_id foreign key(sibling_field_id) references field(field_id) + + + +-- domain_field +drop table if exists domain_field +create table domain_field +( + domain_id tinyint not null, + field_seq smallint not null, + field_id tinyint not null +) +go + +insert into domain_field with(tablock) +select a.domain_id, + b.field_seq, + field_id = replace(b.id, 'https://openalex.org/fields/', '') +from _domain as a +join $(domains_json_db_name)..domain_field as b on a.folder = b.folder and a.record_id = b.record_id + +alter table domain_field add constraint pk_domain_field primary key(domain_id, field_seq) +create index idx_domain_field_field_id on domain_field(field_id) +alter table domain_field add constraint fk_domain_field_domain_id_domain_domain_id foreign key(domain_id) references 
domain(domain_id) +alter table domain_field add constraint fk_domain_field_field_id_field_field_id foreign key(field_id) references field(field_id) + diff --git a/src/sql/relational/table_scripts/04_funder.sql b/src/sql/relational/table_scripts/05_funder.sql similarity index 100% rename from src/sql/relational/table_scripts/04_funder.sql rename to src/sql/relational/table_scripts/05_funder.sql diff --git a/src/sql/relational/table_scripts/03_institution.sql b/src/sql/relational/table_scripts/06_institution.sql similarity index 78% rename from src/sql/relational/table_scripts/03_institution.sql rename to src/sql/relational/table_scripts/06_institution.sql index a5e9d72..9586c61 100644 --- a/src/sql/relational/table_scripts/03_institution.sql +++ b/src/sql/relational/table_scripts/06_institution.sql @@ -189,6 +189,7 @@ create table institution latitude float null, longitude float null, homepage_url varchar(600) null, + is_super_system bit null, ror_id varchar(9) null, grid_id varchar(13) null, openalex_id varchar(11) not null, @@ -212,6 +213,7 @@ select a.institution_id, b.geo_latitude, b.geo_longitude, b.homepage_url, + is_super_system = case when b.is_super_system = 'true' then 1 when b.is_super_system = 'false' then 0 else null end, ror_id = replace(b.id_ror, 'https://ror.org/', ''), grid_id = b.id_grid, openalex_id = 'I' + cast(a.institution_id as varchar(10)), @@ -248,24 +250,36 @@ from #institution_from_source as a left join institution as b on a.institution_id = b.institution_id where b.institution_id is null -drop table if exists #institution_from_work -select institution_id, openalex_id, institution -into #institution_from_work -from -( - select institution_id = replace(id, 'https://openalex.org/I', ''), - openalex_id = replace(id, 'https://openalex.org/', ''), - institution = display_name, - [filter] = row_number() over (partition by id order by count(*) desc, display_name) - from $(works_json_db_name)..work_authorship_institution - where 
patindex('https://openalex.org/I%', id) > 0 - group by id, display_name -) as a -where [filter] = 1 +drop table if exists #institution_from_work1 +select distinct institution_id = replace(institution_id, 'https://openalex.org/I', ''), openalex_id = replace(institution_id, 'https://openalex.org/', '') +into #institution_from_work1 +from $(works_json_db_name)..work_authorship_affiliation_institution_id -insert into institution with(tablock) (institution_id, openalex_id, institution) -select a.institution_id, a.openalex_id, a.institution -from #institution_from_work as a +insert into institution with(tablock) (institution_id, openalex_id) +select a.institution_id, a.openalex_id +from #institution_from_work1 as a +left join institution as b on a.institution_id = b.institution_id +where b.institution_id is null + +drop table if exists #institution_from_work2 +select distinct institution_id = replace(id, 'https://openalex.org/I', ''), openalex_id = replace(id, 'https://openalex.org/', '') +into #institution_from_work2 +from $(works_json_db_name)..work_authorship_institution + +insert into institution with(tablock) (institution_id, openalex_id) +select a.institution_id, a.openalex_id +from #institution_from_work2 as a +left join institution as b on a.institution_id = b.institution_id +where b.institution_id is null + +drop table if exists #institution_from_author +select distinct institution_id = replace(institution_id, 'https://openalex.org/I', ''), openalex_id = replace(institution_id, 'https://openalex.org/', '') +into #institution_from_author +from $(authors_json_db_name)..author_affiliation + +insert into institution with(tablock) (institution_id, openalex_id) +select a.institution_id, a.openalex_id +from #institution_from_author as a left join institution as b on a.institution_id = b.institution_id where b.institution_id is null @@ -407,7 +421,7 @@ drop table if exists institution_international_name create table institution_international_name ( institution_id bigint 
not null, - language_code varchar(11) not null, + language_code varchar(16) not null, institution_international_name nvarchar(200) not null ) go @@ -474,3 +488,74 @@ alter table institution_lineage add constraint pk_institution_lineage primary ke create index idx_institution_lineage_lineage_institution_id on institution_lineage(lineage_institution_id) alter table institution_lineage add constraint fk_institution_lineage_institution_id_institution_institution_id foreign key(institution_id) references institution(institution_id) alter table institution_lineage add constraint fk_institution_lineage_lineage_institution_id_institution_institution_id foreign key(lineage_institution_id) references institution(institution_id) + + + +-- author_institution +drop table if exists author_institution +create table author_institution +( + author_id bigint not null, + institution_seq smallint not null, + institution_id bigint not null +) +go + +insert into author_institution with(tablock) +select a.author_id, + institution_seq = b.affiliation_seq, + institution_id = replace(b.institution_id, 'https://openalex.org/I', '') +from _author as a +join $(authors_json_db_name)..author_affiliation as b on a.folder = b.folder and a.record_id = b.record_id + +alter table author_institution add constraint pk_author_institution primary key(author_id, institution_seq) +alter table author_institution add constraint fk_author_institution_author_id_author_author_id foreign key(author_id) references author(author_id) +alter table author_institution add constraint fk_author_institution_institution_id_institution_institution_id foreign key(institution_id) references institution(institution_id) + + + +-- author_institution_year +drop table if exists author_institution_year +create table author_institution_year +( + author_id bigint not null, + institution_seq smallint not null, + year_seq smallint not null, + [year] smallint not null +) +go + +insert into author_institution_year with(tablock) +select 
a.author_id, + institution_seq = b.affiliation_seq, + b.year_seq, + b.[year] +from _author as a +join $(authors_json_db_name)..author_affiliation_year as b on a.folder = b.folder and a.record_id = b.record_id + +alter table author_institution_year add constraint pk_author_institution_year primary key(author_id, institution_seq, year_seq) +alter table author_institution_year add constraint fk_author_institution_year_author_id_author_author_id foreign key(author_id) references author(author_id) +alter table author_institution_year add constraint fk_author_institution_year_author_id_author_institution_author_id_institution_seq foreign key(author_id, institution_seq) references author_institution(author_id, institution_seq) + + + +-- author_last_known_institution: last known institutions of each author, keyed by (author_id, last_known_institution_seq) +drop table if exists author_last_known_institution +create table author_last_known_institution +( + author_id bigint not null, + last_known_institution_seq smallint not null, + last_known_institution_id bigint not null +) +go + +insert into author_last_known_institution with(tablock) +select a.author_id, + last_known_institution_seq = b.last_known_institution_seq, + last_known_institution_id = replace(b.id, 'https://openalex.org/I', '') +from _author as a +join $(authors_json_db_name)..author_last_known_institution as b on a.folder = b.folder and a.record_id = b.record_id + +alter table author_last_known_institution add constraint pk_author_last_known_institution primary key(author_id, last_known_institution_seq) +alter table author_last_known_institution add constraint fk_author_last_known_institution_author_id_author_author_id foreign key(author_id) references author(author_id) +alter table author_last_known_institution add constraint fk_author_last_known_institution_institution_id_institution_institution_id foreign key(last_known_institution_id) references institution(institution_id) diff --git a/src/sql/relational/table_scripts/06_institution_funder_publisher.sql
b/src/sql/relational/table_scripts/07_institution_funder_publisher.sql similarity index 100% rename from src/sql/relational/table_scripts/06_institution_funder_publisher.sql rename to src/sql/relational/table_scripts/07_institution_funder_publisher.sql diff --git a/src/sql/relational/table_scripts/05_publisher.sql b/src/sql/relational/table_scripts/08_publisher.sql similarity index 100% rename from src/sql/relational/table_scripts/05_publisher.sql rename to src/sql/relational/table_scripts/08_publisher.sql diff --git a/src/sql/relational/table_scripts/07_source.sql b/src/sql/relational/table_scripts/09_source.sql similarity index 98% rename from src/sql/relational/table_scripts/07_source.sql rename to src/sql/relational/table_scripts/09_source.sql index 845daf1..be38487 100644 --- a/src/sql/relational/table_scripts/07_source.sql +++ b/src/sql/relational/table_scripts/09_source.sql @@ -193,6 +193,7 @@ join $(sources_json_db_name)..source_society as b on a.folder = b.folder and a.r alter table source_society add constraint pk_source_society primary key(source_id, society_seq) create index idx_source_society_society on source_society(society) +alter table source_society add constraint fk_source_society_source_id_source_source_id foreign key(source_id) references [source](source_id) diff --git a/src/sql/relational/table_scripts/10_subfield.sql b/src/sql/relational/table_scripts/10_subfield.sql new file mode 100644 index 0000000..5ea008e --- /dev/null +++ b/src/sql/relational/table_scripts/10_subfield.sql @@ -0,0 +1,110 @@ +set nocount on + +-- subfield +drop table if exists subfield +create table subfield +( + subfield_id smallint not null, + subfield nvarchar(120) not null, + [description] nvarchar(250) null, + openalex_id smallint not null, + wikidata_id varchar(10) null, + wikipedia_url varchar(180) null, + domain_id tinyint not null, + field_id tinyint not null, + updated_date date not null, + created_date datetime2 not null +) +go + +insert into subfield 
with(tablock) +select a.subfield_id, + b.display_name, + b.[description], + openalex_id = a.subfield_id, + wikidata_id = replace(replace(b.id_wikidata, 'https://www.wikidata.org/wiki/', ''), 'http://www.wikidata.org/entity/', ''), + wikipedia_url = b.id_wikipedia, + domain_id = replace(b.domain_id, 'https://openalex.org/domains/', ''), + field_id = replace(b.field_id, 'https://openalex.org/fields/', ''), + b.updated_date, + b.created_date +from _subfield as a +join $(subfields_json_db_name)..subfield as b on a.folder = b.folder and a.record_id = b.record_id + +alter table subfield add constraint pk_subfield primary key(subfield_id) +alter table subfield add constraint fk_subfield_domain_id_domain_domain_id foreign key(domain_id) references domain(domain_id) +alter table subfield add constraint fk_subfield_field_id_field_field_id foreign key(field_id) references field(field_id) +create index idx_subfield_openalex_id on subfield(openalex_id) +create index idx_subfield_wikidata_id on subfield(wikidata_id) +create index idx_subfield_domain_id on subfield(domain_id) +create index idx_subfield_field_id on subfield(field_id) + + + +-- subfield_alternative_name +drop table if exists subfield_alternative_name +create table subfield_alternative_name +( + subfield_id smallint not null, + alternative_name_seq smallint not null, + alternative_name nvarchar(255) not null +) +go + +insert into subfield_alternative_name with(tablock) +select a.subfield_id, + b.display_name_alternative_seq, + b.display_name_alternative +from _subfield as a +join $(subfields_json_db_name)..subfield_display_name_alternative as b on a.folder = b.folder and a.record_id = b.record_id + +alter table subfield_alternative_name add constraint pk_subfield_alternative_name primary key(subfield_id, alternative_name_seq) +alter table subfield_alternative_name add constraint fk_subfield_alternative_name_subfield_id_subfield_subfield_id foreign key(subfield_id) references subfield(subfield_id) + + + +-- 
subfield_sibling +drop table if exists subfield_sibling +create table subfield_sibling +( + subfield_id smallint not null, + sibling_subfield_seq smallint not null, + sibling_subfield_id smallint not null +) +go + +insert into subfield_sibling with(tablock) +select a.subfield_id, + b.sibling_seq, + sibling_subfield_id = replace(b.id, 'https://openalex.org/subfields/', '') +from _subfield as a +join $(subfields_json_db_name)..subfield_sibling as b on a.folder = b.folder and a.record_id = b.record_id + +alter table subfield_sibling add constraint pk_subfield_sibling primary key(subfield_id, sibling_subfield_seq) +create index idx_subfield_sibling_sibling_subfield_id on subfield_sibling(sibling_subfield_id) +alter table subfield_sibling add constraint fk_subfield_sibling_subfield_id_subfield_subfield_id foreign key(subfield_id) references subfield(subfield_id) +alter table subfield_sibling add constraint fk_subfield_sibling_sibling_subfield_id_subfield_subfield_id foreign key(sibling_subfield_id) references subfield(subfield_id) + + + +-- field_subfield +drop table if exists field_subfield +create table field_subfield +( + field_id tinyint not null, + subfield_seq smallint not null, + subfield_id smallint not null +) +go + +insert into field_subfield with(tablock) +select a.field_id, + b.subfield_seq, + subfield_id = replace(b.id, 'https://openalex.org/subfields/', '') +from _field as a +join $(fields_json_db_name)..field_subfield as b on a.folder = b.folder and a.record_id = b.record_id + +alter table field_subfield add constraint pk_field_subfield primary key(field_id, subfield_seq) +create index idx_field_subfield_subfield_id on field_subfield(subfield_id) +alter table field_subfield add constraint fk_field_subfield_field_id_field_field_id foreign key(field_id) references field(field_id) +alter table field_subfield add constraint fk_field_subfield_subfield_id_subfield_subfield_id foreign key(subfield_id) references subfield(subfield_id) diff --git 
a/src/sql/relational/table_scripts/11_topic.sql b/src/sql/relational/table_scripts/11_topic.sql new file mode 100644 index 0000000..8609deb --- /dev/null +++ b/src/sql/relational/table_scripts/11_topic.sql @@ -0,0 +1,111 @@ +set nocount on + +-- topic +drop table if exists topic +create table topic +( + topic_id smallint not null, + topic nvarchar(120) not null, + [description] nvarchar(1000) null, + openalex_id smallint not null, + wikipedia_url varchar(180) null, + domain_id tinyint not null, + field_id tinyint not null, + subfield_id smallint not null, + updated_date date not null, + created_date datetime2 not null +) +go + +insert into topic with(tablock) +select a.topic_id, + b.display_name, + b.[description], + openalex_id = a.topic_id, + wikipedia_url = b.id_wikipedia, + domain_id = replace(b.domain_id, 'https://openalex.org/domains/', ''), + field_id = replace(b.field_id, 'https://openalex.org/fields/', ''), + subfield_id = replace(b.subfield_id, 'https://openalex.org/subfields/', ''), + b.updated_date, + b.created_date +from _topic as a +join $(topics_json_db_name)..topic as b on a.folder = b.folder and a.record_id = b.record_id + +alter table topic add constraint pk_topic primary key(topic_id) +alter table topic add constraint fk_topic_domain_id_domain_domain_id foreign key(domain_id) references domain(domain_id) +alter table topic add constraint fk_topic_field_id_field_field_id foreign key(field_id) references field(field_id) +alter table topic add constraint fk_topic_subfield_id_subfield_subfield_id foreign key(subfield_id) references subfield(subfield_id) +create index idx_topic_openalex_id on topic(openalex_id) +create index idx_topic_domain_id on topic(domain_id) +create index idx_topic_field_id on topic(field_id) +create index idx_topic_subfield_id on topic(subfield_id) + + + +-- topic_keyword +drop table if exists topic_keyword +create table topic_keyword +( + topic_id smallint not null, + keyword_seq smallint not null, + keyword nvarchar(100) not 
null +) +go + +insert into topic_keyword with(tablock) +select a.topic_id, + b.keyword_seq, + b.keyword +from _topic as a +join $(topics_json_db_name)..topic_keyword as b on a.folder = b.folder and a.record_id = b.record_id + +alter table topic_keyword add constraint pk_topic_keyword primary key(topic_id, keyword_seq) +alter table topic_keyword add constraint fk_topic_keyword_topic_id_topic_topic_id foreign key(topic_id) references topic(topic_id) + + + +-- topic_sibling +drop table if exists topic_sibling +create table topic_sibling +( + topic_id smallint not null, + sibling_topic_seq smallint not null, + sibling_topic_id smallint not null +) +go + +insert into topic_sibling with(tablock) +select a.topic_id, + b.sibling_seq, + sibling_topic_id = replace(b.id, 'https://openalex.org/T', '') +from _topic as a +join $(topics_json_db_name)..topic_sibling as b on a.folder = b.folder and a.record_id = b.record_id + +alter table topic_sibling add constraint pk_topic_sibling primary key(topic_id, sibling_topic_seq) +create index idx_topic_sibling_sibling_topic_id on topic_sibling(sibling_topic_id) +alter table topic_sibling add constraint fk_topic_sibling_topic_id_topic_topic_id foreign key(topic_id) references topic(topic_id) +alter table topic_sibling add constraint fk_topic_sibling_sibling_topic_id_topic_topic_id foreign key(sibling_topic_id) references topic(topic_id) + + + +-- subfield_topic +drop table if exists subfield_topic +create table subfield_topic +( + subfield_id smallint not null, + topic_seq smallint not null, + topic_id smallint not null +) +go + +insert into subfield_topic with(tablock) +select a.subfield_id, + b.topic_seq, + topic_id = replace(b.id, 'https://openalex.org/T', '') +from _subfield as a +join $(subfields_json_db_name)..subfield_topic as b on a.folder = b.folder and a.record_id = b.record_id + +alter table subfield_topic add constraint pk_subfield_topic primary key(subfield_id, topic_seq) +create index idx_subfield_topic_topic_id on 
subfield_topic(topic_id) +alter table subfield_topic add constraint fk_subfield_topic_subfield_id_subfield_subfield_id foreign key(subfield_id) references subfield(subfield_id) +alter table subfield_topic add constraint fk_subfield_topic_topic_id_topic_topic_id foreign key(topic_id) references topic(topic_id) diff --git a/src/sql/relational/table_scripts/08_work.sql b/src/sql/relational/table_scripts/12_work.sql similarity index 88% rename from src/sql/relational/table_scripts/08_work.sql rename to src/sql/relational/table_scripts/12_work.sql index 31dd321..9b18d3a 100644 --- a/src/sql/relational/table_scripts/08_work.sql +++ b/src/sql/relational/table_scripts/12_work.sql @@ -99,6 +99,8 @@ begin print @n_works_best_oa_location2 end +update #work_location3 set license = 'cc-by' where license = 'cc-by-4.0' + -- work_type @@ -106,7 +108,7 @@ drop table if exists work_type create table work_type ( work_type_id tinyint not null identity(1, 1), - work_type varchar(20) not null + work_type varchar(25) not null ) go @@ -152,7 +154,7 @@ drop table if exists oa_status create table oa_status ( oa_status_id tinyint not null identity(1, 1), - oa_status varchar(6) not null + oa_status varchar(10) not null ) go @@ -488,7 +490,7 @@ end insert into license with(tablock) select license -from $(works_json_db_name)..work_location +from #work_location3 where license is not null except select license @@ -563,11 +565,15 @@ go insert into work_title with(tablock) select a.work_id, - trim($(etl_db_name).dbo.remove_html_tags($(etl_db_name).dbo.decode_html_characters2(b.title))) + trim($(etl_db_name).dbo.regex_replace($(etl_db_name).dbo.remove_html_tags($(etl_db_name).dbo.decode_html_characters2(b.title)), '[\r\n\t]', ' ')) from _work as a join $(works_json_db_name)..work as b on a.folder = b.folder and a.record_id = b.record_id where b.title is not null +delete a with(tablock) +from work_title as a +where len(a.title) = 0 + alter table work_title add constraint pk_work_title primary 
key(work_id) alter table work_title add constraint fk_work_title_work_id_work_work_id foreign key(work_id) references work(work_id) @@ -584,11 +590,15 @@ go insert into work_abstract with(tablock) select a.work_id, - trim($(etl_db_name).dbo.remove_html_tags($(etl_db_name).dbo.decode_html_characters2(b.abstract))) + trim($(etl_db_name).dbo.regex_replace($(etl_db_name).dbo.remove_html_tags($(etl_db_name).dbo.decode_html_characters2(b.abstract)), '[\r\n\t]', ' ')) from _work as a join $(works_json_db_name)..work_abstract as b on a.folder = b.folder and a.record_id = b.record_id where b.abstract is not null +delete a with(tablock) +from work_abstract as a +where len(a.abstract) = 0 + alter table work_abstract add constraint pk_work_abstract primary key(work_id) alter table work_abstract add constraint fk_work_abstract_work_id_work_work_id foreign key(work_id) references work(work_id) @@ -888,35 +898,35 @@ drop table if exists keyword create table keyword ( keyword_id int not null identity(1, 1), - keyword nvarchar(1200) not null, + keyword nvarchar(200) not null, ) go -if exists (select * from master.dbo.sysdatabases where name = '$(previous_relational_db_name)') -begin - if exists (select * from $(previous_relational_db_name).sys.tables where [name] = 'keyword') - begin - set identity_insert keyword on +--if exists (select * from master.dbo.sysdatabases where name = '$(previous_relational_db_name)') +--begin +-- if exists (select * from $(previous_relational_db_name).sys.tables where [name] = 'keyword') +-- begin +-- set identity_insert keyword on - insert into keyword with(tablock) (keyword_id, keyword) - select keyword_id, keyword - from $(previous_relational_db_name)..keyword +-- insert into keyword with(tablock) (keyword_id, keyword) +-- select keyword_id, keyword +-- from $(previous_relational_db_name)..keyword - set identity_insert keyword off - end -end +-- set identity_insert keyword off +-- end +--end insert into keyword with(tablock) -select keyword +select 
keyword = display_name from $(works_json_db_name)..work_keyword -where keyword is not null +where display_name is not null except select keyword from keyword order by keyword alter table keyword add constraint pk_keyword primary key(keyword_id) ---create index idx_keyword_keyword on keyword(keyword) +create index idx_keyword_keyword on keyword(keyword) @@ -938,9 +948,100 @@ select a.work_id, b.score from _work as a join $(works_json_db_name)..work_keyword as b on a.folder = b.folder and a.record_id = b.record_id -join keyword as c on b.keyword = c.keyword +join keyword as c on b.display_name = c.keyword alter table work_keyword add constraint pk_work_keyword primary key(work_id, keyword_seq) create index idx_work_keyword_keyword_id on work_keyword(keyword_id) alter table work_keyword add constraint fk_work_keyword_work_id_work_work_id foreign key(work_id) references work(work_id) alter table work_keyword add constraint fk_work_keyword_keyword_id_keyword_keyword_id foreign key(keyword_id) references keyword(keyword_id) + + + +-- work_topic +drop table if exists work_topic +create table work_topic +( + work_id bigint not null, + topic_seq smallint not null, + topic_id smallint not null, + score float not null, + is_primary_topic bit not null +) +go + +insert into work_topic with(tablock) +select a.work_id, + b.topic_seq, + topic_id = replace(b.id, 'https://openalex.org/T', ''), + b.score, + is_primary_topic = case when b.topic_seq = 1 then 1 else 0 end +from _work as a +join $(works_json_db_name)..work_topic as b on a.folder = b.folder and a.record_id = b.record_id + +alter table work_topic add constraint pk_work_topic primary key(work_id, topic_seq) +create index idx_work_topic_topic_id on work_topic(topic_id) +create index idx_work_topic_is_primary_topic on work_topic(is_primary_topic) +alter table work_topic add constraint fk_work_topic_work_id_work_work_id foreign key(work_id) references work(work_id) +alter table work_topic add constraint 
fk_work_topic_topic_id_topic_topic_id foreign key(topic_id) references topic(topic_id) + + + +-- data_source +drop table if exists [data_source] +create table [data_source] +( + data_source_id int not null identity(1, 1), + [data_source] varchar(20) not null, +) +go + +--if exists (select * from master.dbo.sysdatabases where name = '$(previous_relational_db_name)') +--begin +-- if exists (select * from $(previous_relational_db_name).sys.tables where [name] = 'data_source') +-- begin +-- set identity_insert [data_source] on + +-- insert into [data_source] with(tablock) (data_source_id, [data_source]) +-- select data_source_id, [data_source] +-- from $(previous_relational_db_name)..[data_source] + +-- set identity_insert [data_source] off +-- end +--end + +insert into [data_source] with(tablock) +select [data_source] = indexed_in +from $(works_json_db_name)..work_indexed_in +where indexed_in is not null +except +select [data_source] +from [data_source] +order by [data_source] + +alter table [data_source] add constraint pk_data_source primary key(data_source_id) +create index idx_data_source_data_source on [data_source](data_source) + + + +-- work_data_source +drop table if exists work_data_source +create table work_data_source +( + work_id bigint not null, + data_source_seq smallint not null, + data_source_id int not null +) +go + +insert into work_data_source with(tablock) +select a.work_id, + data_source_seq = b.indexed_in_seq, + c.data_source_id +from _work as a +join $(works_json_db_name)..work_indexed_in as b on a.folder = b.folder and a.record_id = b.record_id +join [data_source] as c on b.indexed_in = c.[data_source] + +alter table work_data_source add constraint pk_work_data_source primary key(work_id, data_source_seq) +create index idx_work_data_source_data_source_id on work_data_source(data_source_id) +alter table work_data_source add constraint fk_work_data_source_work_id_work_work_id foreign key(work_id) references work(work_id) +alter table 
work_data_source add constraint fk_work_data_source_data_source_id_data_source_data_source_id foreign key(data_source_id) references [data_source](data_source_id) diff --git a/src/sql/relational/table_scripts/09_work_author_institution.sql b/src/sql/relational/table_scripts/13_work_author_affiliation_institution.sql similarity index 52% rename from src/sql/relational/table_scripts/09_work_author_institution.sql rename to src/sql/relational/table_scripts/13_work_author_affiliation_institution.sql index 9822894..f684f2f 100644 --- a/src/sql/relational/table_scripts/09_work_author_institution.sql +++ b/src/sql/relational/table_scripts/13_work_author_affiliation_institution.sql @@ -6,37 +6,121 @@ select a.work_id, author_id = replace(b.author_id, 'https://openalex.org/A', ''), b.author_position, b.is_corresponding, - raw_author_name = cast(left(b.raw_author_name, 800) as nvarchar(800)), - raw_affiliation_string = cast(left(b.raw_affiliation_string, 800) as nvarchar(800)) + raw_author_name = cast(left(b.raw_author_name, 800) as nvarchar(800)) into #work_author from _work as a join $(works_json_db_name)..work_authorship as b on a.folder = b.folder and a.record_id = b.record_id create clustered columnstore index idx_tmp_work_author on #work_author + + drop table if exists #work_author_raw_affiliation_string select a.work_id, author_seq = b.authorship_seq, b.raw_affiliation_string_seq, - raw_affiliation_string = cast(left(b.raw_affiliation_string, 800) as nvarchar(800)) + raw_affiliation_string = cast($(etl_db_name).dbo.regex_replace(left(b.raw_affiliation_string, 800), '^\s*|\s*$', '') as nvarchar(800)) into #work_author_raw_affiliation_string from _work as a join $(works_json_db_name)..work_authorship_raw_affiliation_string as b on a.folder = b.folder and a.record_id = b.record_id create clustered columnstore index idx_tmp_work_author_raw_affiliation_string on #work_author_raw_affiliation_string -declare @n_records_with_missing_raw_affiliation_strings as int +delete from 
#work_author_raw_affiliation_string with(tablock) +where len(raw_affiliation_string) = 0 -select @n_records_with_missing_raw_affiliation_strings = count(*) -from #work_author as a -left join #work_author_raw_affiliation_string as b on a.work_id = b.work_id and a.author_seq = b.author_seq -where a.raw_affiliation_string is not null - and b.work_id is null +delete from #work_author_raw_affiliation_string with(tablock) +where raw_affiliation_string is null + +drop table if exists #work_author_affiliation +select a.work_id, + author_seq = b.authorship_seq, + author_affiliation_seq = b.affiliation_seq, + raw_affiliation_string = cast($(etl_db_name).dbo.regex_replace(left(b.raw_affiliation_string, 800), '^\s*|\s*$', '') as nvarchar(800)) +into #work_author_affiliation +from _work as a +join $(works_json_db_name)..work_authorship_affiliation as b on a.folder = b.folder and a.record_id = b.record_id + +create clustered columnstore index idx_tmp_work_author_affiliation on #work_author_affiliation + +delete from #work_author_affiliation with(tablock) +where len(raw_affiliation_string) = 0 + +delete from #work_author_affiliation with(tablock) +where raw_affiliation_string is null + +declare @n_missing_raw_affiliation_strings as int + +select @n_missing_raw_affiliation_strings = count(*) +from #work_author_raw_affiliation_string as a +left join #work_author_affiliation as b on a.work_id = b.work_id and a.author_seq = b.author_seq and a.raw_affiliation_string = b.raw_affiliation_string +where b.work_id is null -if @n_records_with_missing_raw_affiliation_strings > 0 +if @n_missing_raw_affiliation_strings > 0 begin - raiserror('Info: Check work_author_raw_affiliation_string.', 2, 1) - print 'Number of work-author records with missing missing_raw_affiliation_string data: ' + cast(@n_records_with_missing_raw_affiliation_strings as varchar(10)) + raiserror('Info: Check work_authorship_affiliation.', 2, 1) + print 'Number of missing raw_affiliation_strings in 
work_authorship_affiliation: ' + cast(@n_missing_raw_affiliation_strings as varchar(10)) +end + +select @n_missing_raw_affiliation_strings = count(*) +from #work_author_affiliation as a +left join #work_author_raw_affiliation_string as b on a.work_id = b.work_id and a.author_seq = b.author_seq and a.raw_affiliation_string = b.raw_affiliation_string +where b.work_id is null + +if @n_missing_raw_affiliation_strings > 0 +begin + raiserror('Info: Check work_authorship_raw_affiliation_string.', 2, 1) + print 'Number of missing raw_affiliation_strings in work_authorship_raw_affiliation_string: ' + cast(@n_missing_raw_affiliation_strings as varchar(10)) +end + + + +drop table if exists #work_author_institution +select a.work_id, + author_seq = b.authorship_seq, + author_institution_seq = c.institution_seq, + institution_id = replace(c.id, 'https://openalex.org/I', '') +into #work_author_institution +from _work as a +join $(works_json_db_name)..work_authorship as b on a.folder = b.folder and a.record_id = b.record_id +join $(works_json_db_name)..work_authorship_institution as c on a.folder = c.folder and a.record_id = c.record_id and b.authorship_seq = c.authorship_seq + +create clustered columnstore index idx_tmp_work_author_institution on #work_author_institution + +drop table if exists #work_author_affiliation_institution +select a.work_id, + author_seq = b.authorship_seq, + author_affiliation_seq = b.affiliation_seq, + author_affiliation_institution_seq = b.institution_id_seq, + institution_id = replace(b.institution_id, 'https://openalex.org/I', '') +into #work_author_affiliation_institution +from _work as a +join $(works_json_db_name)..work_authorship_affiliation_institution_id as b on a.folder = b.folder and a.record_id = b.record_id + +create clustered columnstore index idx_tmp_work_author_affiliation_institution on #work_author_affiliation_institution + +declare @n_missing_institution_ids as int + +select @n_missing_institution_ids = count(*) +from 
#work_author_institution as a +left join #work_author_affiliation_institution as b on a.work_id = b.work_id and a.author_seq = b.author_seq and a.institution_id = b.institution_id +where b.work_id is null + +if @n_missing_institution_ids > 0 +begin + raiserror('Info: Check work_authorship_affiliation_institution_id.', 2, 1) + print 'Number of missing institution_ids in work_authorship_affiliation_institution_id: ' + cast(@n_missing_institution_ids as varchar(10)) +end + +select @n_missing_institution_ids = count(*) +from #work_author_affiliation_institution as a +left join #work_author_institution as b on a.work_id = b.work_id and a.author_seq = b.author_seq and a.institution_id = b.institution_id +where b.work_id is null + +if @n_missing_institution_ids > 0 +begin + raiserror('Info: Check work_authorship_institution.', 2, 1) + print 'Number of missing institution_ids in work_authorship_institution: ' + cast(@n_missing_institution_ids as varchar(10)) end @@ -156,6 +240,15 @@ begin end end +insert into raw_affiliation_string with(tablock) +select raw_affiliation_string +from #work_author_affiliation +where raw_affiliation_string is not null +except +select raw_affiliation_string +from raw_affiliation_string +order by raw_affiliation_string + insert into raw_affiliation_string with(tablock) select raw_affiliation_string from #work_author_raw_affiliation_string @@ -233,143 +326,99 @@ alter table work_author_country add constraint fk_work_author_country_country_is --- work_author_raw_affiliation_string -drop table if exists work_author_raw_affiliation_string -create table work_author_raw_affiliation_string -( - work_id bigint not null, - author_seq smallint not null, - raw_affiliation_string_seq smallint not null, - raw_affiliation_string_id int not null -) -go - -insert into work_author_raw_affiliation_string with(tablock) +drop table if exists #work_author_affiliation2 select a.work_id, a.author_seq, - a.raw_affiliation_string_seq, + a.author_affiliation_seq, + 
author_affiliation_seq2 = row_number() over (partition by work_id order by author_seq, author_affiliation_seq), b.raw_affiliation_string_id -from #work_author_raw_affiliation_string as a +into #work_author_affiliation2 +from #work_author_affiliation as a join raw_affiliation_string as b on a.raw_affiliation_string = b.raw_affiliation_string -alter table work_author_raw_affiliation_string add constraint pk_work_author_raw_affiliation_string primary key(work_id, author_seq, raw_affiliation_string_seq) -create index idx_work_author_raw_affiliation_string_raw_affiliation_string_id on work_author_raw_affiliation_string(raw_affiliation_string_id) -alter table work_author_raw_affiliation_string add constraint fk_work_author_raw_affiliation_string_work_id_work_work_id foreign key(work_id) references work(work_id) -alter table work_author_raw_affiliation_string add constraint fk_work_author_raw_affiliation_string_work_id_author_seq_work_author_work_id_author_seq foreign key(work_id, author_seq) references work_author(work_id, author_seq) -alter table work_author_raw_affiliation_string add constraint fk_work_author_raw_affiliation_string_raw_affiliation_string_id_raw_affiliation_string_raw_affiliation_string_id foreign key(raw_affiliation_string_id) references raw_affiliation_string(raw_affiliation_string_id) +-- work_affiliation +drop table if exists work_affiliation +create table work_affiliation +( + work_id bigint not null, + affiliation_seq smallint not null, + raw_affiliation_string_id int not null +) +go -drop table if exists #work_author_institution -select a.work_id, - author_institution_seq = row_number() over (partition by a.work_id order by b.authorship_seq, c.institution_seq), - author_seq = b.authorship_seq, - institution_seq_org = c.institution_seq, - institution_id = replace(c.id, 'https://openalex.org/I', ''), - institution_name = case when c.id is null then cast(left(c.display_name, 800) as nvarchar(800)) else null end -into #work_author_institution -from 
_work as a -join $(works_json_db_name)..work_authorship as b on a.folder = b.folder and a.record_id = b.record_id -join $(works_json_db_name)..work_authorship_institution as c on a.folder = c.folder and a.record_id = c.record_id and b.authorship_seq = c.authorship_seq +insert into work_affiliation with(tablock) +select work_id, + affiliation_seq = row_number() over (partition by work_id order by min(author_affiliation_seq2)), + raw_affiliation_string_id +from #work_author_affiliation2 +group by work_id, raw_affiliation_string_id + +alter table work_affiliation add constraint pk_work_affiliation primary key(work_id, affiliation_seq) +create index idx_work_affiliation_raw_affiliation_string_id on work_affiliation(raw_affiliation_string_id) +alter table work_affiliation add constraint fk_work_affiliation_work_id_work_work_id foreign key(work_id) references work(work_id) +alter table work_affiliation add constraint fk_work_affiliation_raw_affiliation_string_id_raw_affiliation_string_raw_affiliation_string_id foreign key(raw_affiliation_string_id) references raw_affiliation_string(raw_affiliation_string_id) --- institution_name -drop table if exists institution_name -create table institution_name +-- work_author_affiliation +drop table if exists work_author_affiliation +create table work_author_affiliation ( - institution_name_id int not null identity(1, 1), - institution_name nvarchar(800) not null + work_id bigint not null, + author_seq smallint not null, + affiliation_seq smallint not null ) go -if exists (select * from master.dbo.sysdatabases where name = '$(previous_relational_db_name)') -begin - if exists (select * from $(previous_relational_db_name).sys.tables where [name] = 'institution_name') - begin - set identity_insert institution_name on - - insert into institution_name with(tablock) (institution_name_id, institution_name) - select institution_name_id, institution_name - from $(previous_relational_db_name)..institution_name - - set identity_insert 
institution_name off - end -end - -insert into institution_name with(tablock) -select institution_name -from #work_author_institution -where institution_name is not null -except -select institution_name -from institution_name -order by institution_name +insert into work_author_affiliation with(tablock) +select distinct a.work_id, + a.author_seq, + b.affiliation_seq +from #work_author_affiliation2 as a +join work_affiliation as b on a.work_id = b.work_id and a.raw_affiliation_string_id = b.raw_affiliation_string_id -alter table institution_name add constraint pk_institution_name primary key(institution_name_id) -create index idx_institution_name_institution_name on institution_name(institution_name) +alter table work_author_affiliation add constraint pk_work_author_affiliation primary key(work_id, author_seq, affiliation_seq) +alter table work_author_affiliation add constraint fk_work_author_affiliation_work_id_work_work_id foreign key(work_id) references work(work_id) +alter table work_author_affiliation add constraint fk_work_author_affiliation_work_id_author_seq_pub_author_work_id_author_seq foreign key(work_id, author_seq) references work_author(work_id, author_seq) +alter table work_author_affiliation add constraint fk_work_author_affiliation_work_id_affiliation_seq_work_affiliation_work_id_affiliation_seq foreign key(work_id, affiliation_seq) references work_affiliation(work_id, affiliation_seq) -drop table if exists #work_author_institution2 +drop table if exists #work_author_affiliation_institution2 select a.work_id, - a.author_institution_seq, a.author_seq, - a.institution_seq_org, - a.institution_id, - b.institution_name_id -into #work_author_institution2 -from #work_author_institution as a -left join institution_name as b on a.institution_name = b.institution_name + c.affiliation_seq, + a.author_affiliation_institution_seq, + a.institution_id +into #work_author_affiliation_institution2 +from #work_author_affiliation_institution as a +join 
#work_author_affiliation2 as b on a.work_id = b.work_id and a.author_seq = b.author_seq and a.author_affiliation_seq = b.author_affiliation_seq +join work_affiliation as c on a.work_id = c.work_id and b.raw_affiliation_string_id = c.raw_affiliation_string_id --- work_institution -drop table if exists work_institution -create table work_institution +-- work_affiliation_institution +drop table if exists work_affiliation_institution +create table work_affiliation_institution ( work_id bigint not null, + affiliation_seq smallint not null, institution_seq smallint not null, - institution_id bigint null, - institution_name_id int null + institution_id bigint not null ) go -insert into work_institution with(tablock) +insert into work_affiliation_institution with(tablock) select work_id, - institution_seq = row_number() over (partition by work_id order by min(author_institution_seq)), - institution_id, - institution_name_id -from #work_author_institution2 -group by work_id, institution_id, institution_name_id - -alter table work_institution add constraint pk_work_institution primary key(work_id, institution_seq) -create index idx_work_institution_institution_id on work_institution(institution_id) -create index idx_work_institution_institution_name_id on work_institution(institution_name_id) -alter table work_institution add constraint fk_work_institution_work_id_work_work_id foreign key(work_id) references work(work_id) -alter table work_institution add constraint fk_work_institution_institution_id_institution_institution_id foreign key(institution_id) references institution(institution_id) -alter table work_institution add constraint fk_work_institution_institution_name_id_institution_name_institution_name_id foreign key(institution_name_id) references institution_name(institution_name_id) - - - --- work_author_institution -drop table if exists work_author_institution -create table work_author_institution -( - work_id bigint not null, - author_seq smallint not null, - 
institution_seq smallint not null -) -go - -insert into work_author_institution with(tablock) -select distinct a.work_id, - a.author_seq, - b.institution_seq -from #work_author_institution2 as a -join work_institution as b on a.work_id = b.work_id and isnull(a.institution_id, -1) = isnull(b.institution_id, -1) and isnull(a.institution_name_id, -1) = isnull(b.institution_name_id, -1) - -alter table work_author_institution add constraint pk_work_author_institution primary key(work_id, author_seq, institution_seq) -alter table work_author_institution add constraint fk_work_author_institution_work_id_work_work_id foreign key(work_id) references work(work_id) -alter table work_author_institution add constraint fk_work_author_institution_work_id_author_seq_work_author_work_id_author_seq foreign key(work_id, author_seq) references work_author(work_id, author_seq) -alter table work_author_institution add constraint fk_work_author_institution_work_id_institution_seq_work_institution_work_id_institution_seq foreign key(work_id, institution_seq) references work_institution(work_id, institution_seq) + affiliation_seq, + institution_seq = row_number() over (partition by work_id, affiliation_seq order by institution_id), + institution_id +from #work_author_affiliation_institution2 +group by work_id, affiliation_seq, institution_id + +alter table work_affiliation_institution add constraint pk_work_affiliation_institution primary key(work_id, affiliation_seq, institution_seq) +alter table work_affiliation_institution add constraint fk_work_affiliation_institution_work_id_work_work_id foreign key(work_id) references work(work_id) +alter table work_affiliation_institution add constraint fk_work_affiliation_institution_work_id_affiliation_seq_work_affiliation_work_id_affiliation_seq foreign key(work_id, affiliation_seq) references work_affiliation(work_id, affiliation_seq) +alter table work_affiliation_institution add constraint 
fk_work_affiliation_institution_institution_id_institution_institution_id foreign key(institution_id) references institution(institution_id) diff --git a/src/sql/relational/table_scripts/10_citation.sql b/src/sql/relational/table_scripts/14_citation.sql similarity index 75% rename from src/sql/relational/table_scripts/10_citation.sql rename to src/sql/relational/table_scripts/14_citation.sql index 6b61744..263af1a 100644 --- a/src/sql/relational/table_scripts/10_citation.sql +++ b/src/sql/relational/table_scripts/14_citation.sql @@ -1,37 +1,32 @@ set nocount on drop table if exists #citation -select - citation_id = identity(int, 1, 1), - citing_work_id = work_id, - reference_seq, - cited_work_id +select citation_id, citing_work_id, reference_seq, cited_work_id into #citation -from work_reference - -drop table if exists #citation2 -select a.citation_id, a.citing_work_id, a.reference_seq, a.cited_work_id -into #citation2 from ( - select *, [filter] = row_number() over (partition by citing_work_id, cited_work_id order by reference_seq asc) - from #citation + select + citation_id = row_number() over (order by work_id, reference_seq), + citing_work_id = work_id, + reference_seq, + cited_work_id, + [filter] = row_number() over (partition by work_id, cited_work_id order by reference_seq asc) + from work_reference + where work_id <> cited_work_id ) as a where a.[filter] = 1 -drop table if exists #citation - drop table if exists #self_cit select distinct citation_id into #self_cit from ( select distinct a.citation_id, b.author_id - from #citation2 as a + from #citation as a join work_author as b on a.citing_work_id = b.work_id union all select distinct a.citation_id, b.author_id - from #citation2 as a + from #citation as a join work_author as b on a.cited_work_id = b.work_id ) as a group by citation_id, author_id @@ -65,17 +60,13 @@ select pub_year = c.pub_year, cit_window = b.pub_year - c.pub_year, is_self_cit = case when d.citation_id is not null then 1 else 0 end -from 
#citation2 as a +from #citation as a join #pub_year as b on a.citing_work_id = b.work_id join #pub_year as c on a.cited_work_id = c.work_id left join #self_cit as d on a.citation_id = d.citation_id -where b.work_id != c.work_id alter table citation add constraint pk_citation primary key(citing_work_id, reference_seq) create index idx_citation_citing_work_id on citation(citing_work_id) create index idx_citation_cited_work_id on citation(cited_work_id) alter table citation add constraint fk_citation_citing_work_id_work_work_id foreign key(citing_work_id) references work(work_id) alter table citation add constraint fk_citation_cited_work_id_work_work_id foreign key(cited_work_id) references work(work_id) - -drop table if exists #citation2 -drop table if exists #self_cit diff --git a/src/sql/relational/table_scripts/11_work_detail.sql b/src/sql/relational/table_scripts/15_work_detail.sql similarity index 87% rename from src/sql/relational/table_scripts/11_work_detail.sql rename to src/sql/relational/table_scripts/15_work_detail.sql index a79b24f..3ef662e 100644 --- a/src/sql/relational/table_scripts/11_work_detail.sql +++ b/src/sql/relational/table_scripts/15_work_detail.sql @@ -66,18 +66,24 @@ alter table #author_et_al add constraint pk_tmp_author_et_al primary key(work_id drop table if exists #work_institution select work_id, - institution_seq, - institution_id, - institution_seq2 = row_number() over (partition by work_id order by institution_seq) + institution_seq = row_number() over (partition by work_id order by min(affiliation_institution_seq)), + institution_id into #work_institution -from work_institution -where institution_id is not null +from +( + select + work_id, + affiliation_institution_seq = row_number() over (partition by work_id order by affiliation_seq, institution_seq), + institution_id + from work_affiliation_institution +) as a +group by work_id, institution_id -- If a publication has eight or more institutions, list the names of the first six 
institutions followed by an ellipsis and then the last institutions' name. drop table if exists #work_last_institution_seq select work_id, - last_institution_seq = max(institution_seq2) + last_institution_seq = max(institution_seq) into #work_last_institution_seq from #work_institution group by work_id @@ -86,15 +92,15 @@ update a set a.institution_id = -1 from #work_institution as a join #work_last_institution_seq as b on a.work_id = b.work_id -where b.last_institution_seq > 7 and a.institution_seq2 = 7 +where b.last_institution_seq > 7 and a.institution_seq = 7 delete a from #work_institution as a join #work_last_institution_seq as b on a.work_id = b.work_id -where b.last_institution_seq > 8 and a.institution_seq2 between 8 and (b.last_institution_seq - 1) +where b.last_institution_seq > 8 and a.institution_seq between 8 and (b.last_institution_seq - 1) drop table if exists #work_last_institution_seq -create index #work_institution on #work_institution(work_id, institution_seq2, institution_id) +create index #work_institution on #work_institution(work_id, institution_seq, institution_id) drop table if exists #institution_first select @@ -103,18 +109,18 @@ select into #institution_first from #work_institution as a join institution as b on a.institution_id = b.institution_id -where a.institution_seq2 = 1 +where a.institution_seq = 1 alter table #institution_first add constraint pk_tmp_institution_first primary key(work_id) drop table if exists #institution_et_al select a.work_id, - institution_et_al = nullif(string_agg(isnull(b.institution, '...'), '; ') within group (order by a.institution_seq2), '') + institution_et_al = nullif(string_agg(isnull(b.institution, '...'), '; ') within group (order by a.institution_seq), '') into #institution_et_al from #work_institution as a left join institution as b on a.institution_id = b.institution_id -where a.institution_seq2 > 1 +where a.institution_seq > 1 group by a.work_id drop table if exists #institution @@ -141,7 +147,7 
@@ create table work_detail pages nvarchar(350) null, doi varchar(330) null, pmid int null, - work_type varchar(20) null, + work_type varchar(25) null, n_cits int not null, n_self_cits int not null ) diff --git a/src/sql/relational/table_scripts/12_update_reference_count.sql b/src/sql/relational/table_scripts/16_update_reference_count.sql similarity index 100% rename from src/sql/relational/table_scripts/12_update_reference_count.sql rename to src/sql/relational/table_scripts/16_update_reference_count.sql diff --git a/src/sql/relational/table_scripts/13_update_citation_counts.sql b/src/sql/relational/table_scripts/17_update_citation_counts.sql similarity index 100% rename from src/sql/relational/table_scripts/13_update_citation_counts.sql rename to src/sql/relational/table_scripts/17_update_citation_counts.sql diff --git a/src/sql/text/post_processing_scripts/01_create_full_text_index.sql b/src/sql/text/post_processing_scripts/01_create_full_text_index.sql index e9230a7..50691a7 100644 --- a/src/sql/text/post_processing_scripts/01_create_full_text_index.sql +++ b/src/sql/text/post_processing_scripts/01_create_full_text_index.sql @@ -9,7 +9,7 @@ go create fulltext catalog text_catalog go -create fulltext index on text_data(title, abstract) +create fulltext index on text_data(title, abstract, keywords) key index pk_text_data on text_catalog with stoplist off, change_tracking off, no population go diff --git a/src/sql/text/table_scripts/01_text_data.sql b/src/sql/text/table_scripts/01_text_data.sql index 219d2fa..d2911da 100644 --- a/src/sql/text/table_scripts/01_text_data.sql +++ b/src/sql/text/table_scripts/01_text_data.sql @@ -1,18 +1,13 @@ set nocount on -drop table if exists #text_data -select - a.work_id, - b.title, - c.abstract, - keywords = string_agg(cast(e.keyword as nvarchar(max)), '; ') -into #text_data -from $(relational_db_name)..work as a -left join $(relational_db_name)..work_title as b on a.work_id = b.work_id -left join 
$(relational_db_name)..work_abstract as c on a.work_id = c.work_id -left join $(relational_db_name)..work_keyword as d on a.work_id = d.work_id -left join $(relational_db_name)..keyword as e on d.keyword_id = e.keyword_id -group by a.work_id, b.title, c.abstract +drop table if exists #work_keywords +select a.work_id, keywords = string_agg(cast(b.keyword as nvarchar(max)), '; ') +into #work_keywords +from $(relational_db_name)..work_keyword as a +join $(relational_db_name)..keyword as b on a.keyword_id = b.keyword_id +group by a.work_id + +create clustered index idx_tmp_work_keywords_work_id on #work_keywords(work_id) drop table if exists text_data create table text_data @@ -24,8 +19,11 @@ create table text_data ) insert into text_data with(tablock) -select work_id, title, abstract, keywords -from #text_data -where not(title is null and abstract is null and keywords is null) +select a.work_id, b.title, c.abstract, d.keywords +from $(relational_db_name)..work as a +left join $(relational_db_name)..work_title as b on a.work_id = b.work_id +left join $(relational_db_name)..work_abstract as c on a.work_id = c.work_id +left join #work_keywords as d on a.work_id = d.work_id +where not(b.title is null and c.abstract is null and d.keywords is null) alter table text_data add constraint pk_text_data primary key(work_id)