-
Notifications
You must be signed in to change notification settings - Fork 169
Home
Titipat Achakulvisut edited this page Jun 11, 2016
·
18 revisions
- An alternative parser: http://fnl.es/medline-kung-fu.html
- The PubMed Open-Access (OA) dataset is available at http://www.ncbi.nlm.nih.gov/pmc/tools/ftp/
- The MEDLINE baseline XML files are available at ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/
- The weekly MEDLINE XML update files are available at ftp://ftp.nlm.nih.gov/nlmdata/.medlease/gz/
- To transform MEDLINE XML to JSON using Node.js, see https://github.com/ldbib/MEDLINEXMLToJSON
Here is a snippet from the MEDLINE DTD. We can use it to see which tags and attributes are available in MEDLINE XML.
<!ELEMENT MedlineCitationSet (MedlineCitation*, DeleteCitation?)>
<!ELEMENT MedlineCitation (PMID, DateCreated, DateCompleted?, DateRevised?, Article, MedlineJournalInfo, ChemicalList?, SupplMeshList?,CitationSubset*, CommentsCorrectionsList?, GeneSymbolList?, MeshHeadingList?,NumberOfReferences?, PersonalNameSubjectList?, OtherID*, OtherAbstract*, KeywordList*, SpaceFlightMission*, InvestigatorList?, GeneralNote*)>
<!ATTLIST MedlineCitation
Owner (NLM | NASA | PIP | KIE | HSR | HMD | NOTNLM) "NLM"
Status (Completed | In-Process | PubMed-not-MEDLINE | In-Data-Review | Publisher | MEDLINE | OLDMEDLINE) #REQUIRED
VersionID CDATA #IMPLIED
VersionDate CDATA #IMPLIED>
<!ELEMENT Abstract (AbstractText+, CopyrightInformation?)>
<!ELEMENT AbstractText (#PCDATA)>
<!ATTLIST AbstractText
Label CDATA #IMPLIED
NlmCategory (BACKGROUND | OBJECTIVE | METHODS | RESULTS | CONCLUSIONS | UNASSIGNED) #IMPLIED>
<!ELEMENT AccessionNumber (#PCDATA)>
<!ELEMENT AccessionNumberList (AccessionNumber+)>
<!ELEMENT Acronym (#PCDATA)>
<!ELEMENT Affiliation (#PCDATA)>
<!ELEMENT AffiliationInfo (Affiliation, Identifier*)>
<!ELEMENT Agency (#PCDATA)>
<!ELEMENT Article (Journal,ArticleTitle,((Pagination, ELocationID*) | ELocationID+),Abstract?,AuthorList?, Language+, DataBankList?, GrantList?,PublicationTypeList, VernacularTitle?, ArticleDate*)>
<!ATTLIST Article
PubModel (Print | Print-Electronic | Electronic | Electronic-Print | Electronic-eCollection) #REQUIRED>
<!ELEMENT ArticleDate (Year,Month,Day)>
<!ATTLIST ArticleDate DateType CDATA #FIXED "Electronic">
<!ELEMENT ArticleTitle (#PCDATA)>
<!ELEMENT Author (((LastName, ForeName?, Initials?, Suffix?) | CollectiveName), Identifier*, AffiliationInfo*)>
<!ATTLIST Author ValidYN (Y | N) "Y">
<!ELEMENT AuthorList (Author+)>
<!ATTLIST AuthorList CompleteYN (Y | N) "Y">
<!ELEMENT Chemical (RegistryNumber,NameOfSubstance)>
<!ELEMENT ChemicalList (Chemical+)>
<!ELEMENT CitationSubset (#PCDATA)>
<!ELEMENT CollectiveName (#PCDATA)>
<!ELEMENT CommentsCorrections (RefSource,PMID?,Note?)>
<!ATTLIST CommentsCorrections
RefType (AssociatedDataset | AssociatedPublication | CommentOn | CommentIn | ErratumIn | ErratumFor | PartialRetractionIn | PartialRetractionOf | RepublishedFrom | RepublishedIn | RetractionOf | RetractionIn | UpdateIn | UpdateOf | SummaryForPatientsIn | OriginalReportIn | ReprintOf | ReprintIn | Cites) #REQUIRED >
<!ELEMENT CommentsCorrectionsList (CommentsCorrections+)>
<!ELEMENT CopyrightInformation (#PCDATA)>
<!ELEMENT Country (#PCDATA)>
<!ELEMENT DataBank (DataBankName, AccessionNumberList?)>
<!ELEMENT DataBankList (DataBank+)>
<!ATTLIST DataBankList CompleteYN (Y | N) "Y">
<!ELEMENT DataBankName (#PCDATA)>
<!ELEMENT DateCompleted (Year,Month,Day)>
<!ELEMENT DateCreated (Year,Month,Day)>
<!ELEMENT DateRevised (Year,Month,Day)>
<!ELEMENT Day (#PCDATA)>
<!ELEMENT DescriptorName (#PCDATA)>
<!ATTLIST DescriptorName
MajorTopicYN (Y | N) "N"
Type (Geographic) #IMPLIED
UI CDATA #REQUIRED>
<!ELEMENT ELocationID (#PCDATA)>
<!ATTLIST ELocationID EIdType (doi | pii) #REQUIRED
ValidYN (Y | N) "Y">
<!ELEMENT EndPage (#PCDATA)>
<!ELEMENT ForeName (#PCDATA)>
<!ELEMENT GeneSymbol (#PCDATA)>
<!ELEMENT GeneSymbolList (GeneSymbol+)>
<!ELEMENT GeneralNote (#PCDATA)>
<!ATTLIST GeneralNote
Owner (NLM | NASA | PIP | KIE | HSR | HMD) "NLM">
<!ELEMENT Grant (GrantID?, Acronym?, Agency, Country)>
<!ELEMENT GrantID (#PCDATA)>
<!ELEMENT GrantList (Grant+)>
<!ATTLIST GrantList CompleteYN (Y | N) "Y">
<!ELEMENT Identifier (#PCDATA)>
<!ATTLIST Identifier
Source CDATA #REQUIRED >
<!ELEMENT ISOAbbreviation (#PCDATA)>
<!ELEMENT ISSN (#PCDATA)>
<!ATTLIST ISSN
IssnType (Electronic | Print) #REQUIRED>
<!ELEMENT ISSNLinking (#PCDATA)>
<!ELEMENT Initials (#PCDATA)>
<!ELEMENT Investigator (LastName, ForeName?, Initials?, Suffix?, Identifier*, AffiliationInfo*)>
<!ATTLIST Investigator
ValidYN (Y | N) "Y">
<!ELEMENT InvestigatorList (Investigator+)>
<!ELEMENT Issue (#PCDATA)>
<!ELEMENT Journal (ISSN?, JournalIssue, Title?, ISOAbbreviation?)>
<!ELEMENT JournalIssue (Volume?, Issue?, PubDate)>
<!ATTLIST JournalIssue
CitedMedium (Internet | Print) #REQUIRED>
<!ELEMENT Keyword (#PCDATA)>
<!ATTLIST Keyword
MajorTopicYN (Y | N) "N">
<!ELEMENT KeywordList (Keyword+)>
<!ATTLIST KeywordList
Owner (NLM | NLM-AUTO | NASA | PIP | KIE | NOTNLM | HHS) "NLM">
<!ELEMENT Language (#PCDATA)>
<!ELEMENT LastName (#PCDATA)>
<!ELEMENT MedlineDate (#PCDATA)>
<!ELEMENT MedlineJournalInfo (Country?, MedlineTA, NlmUniqueID?, ISSNLinking?)>
<!ELEMENT MedlinePgn (#PCDATA)>
<!ELEMENT MedlineTA (#PCDATA)>
<!ELEMENT MeshHeading (DescriptorName, QualifierName*)>
<!ELEMENT MeshHeadingList (MeshHeading+)>
<!ELEMENT Month (#PCDATA)>
<!ELEMENT NameOfSubstance (#PCDATA)>
<!ATTLIST NameOfSubstance
UI CDATA #REQUIRED>
<!ELEMENT NlmUniqueID (#PCDATA)>
<!ELEMENT Note (#PCDATA)>
<!ELEMENT NumberOfReferences (#PCDATA)>
<!ELEMENT OtherAbstract (AbstractText+, CopyrightInformation?)>
<!ATTLIST OtherAbstract
Type (AAMC | AIDS | KIE | PIP | NASA | Publisher) #REQUIRED
Language CDATA "eng">
<!ELEMENT OtherID (#PCDATA)>
<!ATTLIST OtherID
Source (NASA | KIE | PIP | POP | ARPL | CPC | IND | CPFH | CLML | NRCBL | NLM | QCIM) #REQUIRED>
<!ELEMENT PMID (#PCDATA)>
<!ATTLIST PMID
Version CDATA #REQUIRED>
<!ELEMENT Pagination ((StartPage, EndPage?, MedlinePgn?) | MedlinePgn)>
<!ELEMENT PersonalNameSubject (LastName, ForeName?, Initials?, Suffix?)>
<!ELEMENT PersonalNameSubjectList (PersonalNameSubject+)>
<!ELEMENT PubDate ((Year, ((Month, Day?) | Season)?) | MedlineDate)>
<!ELEMENT PublicationType (#PCDATA)>
<!ATTLIST PublicationType
UI CDATA #REQUIRED>
<!ELEMENT PublicationTypeList (PublicationType+)>
<!ELEMENT QualifierName (#PCDATA)>
<!ATTLIST QualifierName
MajorTopicYN (Y | N) "N"
UI CDATA #REQUIRED>
<!ELEMENT RefSource (#PCDATA)>
<!ELEMENT RegistryNumber (#PCDATA)>
<!ELEMENT Season (#PCDATA)>
<!ELEMENT SpaceFlightMission (#PCDATA)>
<!ELEMENT StartPage (#PCDATA)>
<!ELEMENT Suffix (#PCDATA)>
<!ELEMENT SupplMeshList (SupplMeshName+)>
<!ELEMENT SupplMeshName (#PCDATA)>
<!ATTLIST SupplMeshName
Type (Disease | Protocol) #REQUIRED
UI CDATA #REQUIRED>
<!ELEMENT Title (#PCDATA)>
<!ELEMENT VernacularTitle (#PCDATA)>
<!ELEMENT Volume (#PCDATA)>
<!ELEMENT Year (#PCDATA)>
<!ELEMENT DeleteCitation (PMID+)>