Skip to content

Add missing authors and journals

James Kent edited this page Mar 22, 2024 · 1 revision
# Interactive session transcript: failed attempts are kept next to the
# corrected command that follows them.
# NOTE: the first import misspells the package name ("sqlachemy"); the next
# line is the corrected import.
from sqlachemy import func
from sqlalchemy import func
from sqlalchemy import or_
# First attempts at a per-year study count through BaseStudy.query; they are
# superseded by the explicit func.count(...)/group_by(...) queries below.
BaseStudy.query.count(BaseStudy.id).group_by(BaseStudy.year)
BaseStudy.query.count().group_by(BaseStudy.year)
BaseStudy.query.count()
# NOTE(review): a bare `session` name is never defined in this transcript;
# the same query is rebuilt with db.session a few lines down.
query = session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)

# Executing the query and fetching the results
results = query.all()
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)

# Executing the query and fetching the results
results = query.all()
results
# Same count per year, now ordered by year.
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)


# Executing the query and fetching the results
results = query.all()
results
# Neither import below is used: the filters call the column method
# `.in_(...)` directly. `in` is a reserved word, so the second line is not
# valid Python.
from sqlalchemy import in_
from sqlalchemy import in
# Year values to look for; presumably the implausible years seen in the
# per-year counts above, plus None for a missing year.
values_to_check = [None, 0, 1, 3, 9, 13, 16, 19]
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).count()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).all()
# `== None` builds a SQL comparison on the column here (presumably rendered
# as IS NULL), so it is not replaced by a Python `is None` test.
BaseStudy.query.filter(BaseStudy.year==None)
BaseStudy.query.filter(BaseStudy.year==None).count()
# values_to_check[1:] drops the leading None; the missing-year case is added
# back with an explicit `== None` clause inside or_(...) below.
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).all()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).count()
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None))
bad_year
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
# Keep only studies that have a PMID: the year is later looked up by PMID.
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).filter(BaseStudy.pmid != None).all()
len(bad_year)
from Bio import Entrez
from Bio.Entrez.Parser import ValidationError

def _year_from_medline(record):
    """Return the year at the start of the first "DP  - " line, or None."""
    for line in record.splitlines():
        if line.startswith("DP  - "):
            try:
                # The field value starts right after the 6-character prefix;
                # its first four characters are read as the year.
                return int(line[6:10])
            except ValueError:
                return None  # Invalid year format
    return None  # Publication year not found


def get_publication_year(pmid):
    """
    Retrieves the publication year for a given PMID using BioPython.

    Fetches the MEDLINE-format text record from PubMed and reads the year
    from the first "DP  - " line of the response.

    Args:
    - pmid (str): PubMed ID (PMID) of the publication.

    Returns:
    - int or None: The publication year if found, otherwise None. None is
      also returned (after printing a message) when the request or the
      parsing raises.
    """
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Fetch PubMed record for the given PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Extract publication year from the record
        return _year_from_medline(record)

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve publication year.")
        return None

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller skip this PMID
        print(f"An error occurred: {str(e)}")
        return None
# Spot-check the lookup on the first three flagged studies.
get_publication_year(bad_year[0].pmid)
get_publication_year(bad_year[1].pmid)
get_publication_year(bad_year[2].pmid)
# Look up a year for every flagged study and copy it onto the study and each
# of its versions; every touched object is collected in to_commit.
to_commit = []
for bs in bad_year:
    year = get_publication_year(bs.pmid)
    if not year:
        # No usable year came back: leave this study untouched.
        print(f"NO YEAR FOR {bs.pmid}")
        continue
    bs.year = year
    to_commit.append(bs)
    for v in bs.versions:
        v.year = year
        to_commit.append(v)
to_commit
# NOTE(review): add() is given the whole list here; the add_all() call on the
# next line is the one intended for a list of objects.
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
# Re-run the audit query to see how many flagged studies remain.
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)


# Executing the query and fetching the results
results = query.all()
results
# Next task: studies with a missing/blank journal (publication).
# NOTE: the first query compares BaseStudy.year with '' by mistake; the
# following line uses BaseStudy.publication throughout.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.year=='', func.trim(BaseStudy.year)=='')).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
# Presumably clears the transaction left in a failed state by the query
# above before retrying -- confirm.
db.session.rollback()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
len(bad_journal)
# Keep only studies that have a PMID.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).all()
len(bad_journal)
# NOTE: two failed import attempts (misspelled name, then wrong package);
# the third line, importing from sqlalchemy.orm, is the one used below.
from sqlalchemy import joinload
from sqlalchemy import joinedload
from sqlalchemy.orm import joinedload
# NOTE: "opetions" is a typo for "options"; the corrected query follows.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).opetions(joinedload(BaseStudy.versions)).all()
# joinedload(BaseStudy.versions) requests the versions together with each study.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()

def get_journal_names(pmids):
    """
    Look up journal titles for a batch of PubMed IDs via BioPython.

    Args:
    - pmids (list of str): PubMed IDs sent to PubMed in a single efetch call.

    Returns:
    - dict: Maps each PMID string found in the response to the text of its
      "JT  - " line. An empty dict is returned if any step raises.
    """
    titles_by_pmid = {}
    try:
        # NCBI's usage policies ask for a contact e-mail address
        Entrez.email = "[email protected]"

        # efetch takes the IDs as one comma-separated string
        id_param = ",".join(pmids)
        response = Entrez.efetch(db="pubmed", id=id_param, rettype="medline", retmode="text")
        payload = response.read()
        response.close()

        # Entries are split on blank lines; each is scanned line by line
        for entry_text in payload.split("\n\n"):
            record_pmid = None
            title = None
            for row in entry_text.splitlines():
                if row.startswith("PMID- "):
                    record_pmid = row[6:]
                elif row.startswith("JT  - "):
                    title = row[6:]
            # Only record entries that yielded both a PMID and a title
            if record_pmid and title:
                titles_by_pmid[record_pmid] = title

    except ValidationError:
        # Validation errors raised by Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return {}

    except Exception as e:
        # Anything else: report it and return nothing
        print(f"An error occurred: {str(e)}")
        return {}

    return titles_by_pmid
# Number of studies placed in each chunk; the chunks are later passed one at
# a time to get_journal_names.
CHUNK_SIZE = 900
# NOTE: `studies` is not defined anywhere in this transcript; the first two
# attempts reference it and the third line is the corrected version.
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(studies), CHUNK_SIZE)]
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
# Split bad_journal into consecutive slices of at most CHUNK_SIZE studies.
chunks = [bad_journal[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks

def get_journal_names(bs):
    """
    Sets the journal name of studies (and their versions) from PubMed using BioPython.

    Args:
    - bs (list): Study objects to update. Each must expose `pmid`,
      `publication` and `versions` attributes.

    Returns:
    - list: Every study and version whose `publication` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Index the studies by PMID so each returned record can be matched
        # back to the study it belongs to
        pmid_dict = {str(study.pmid): study for study in bs if study.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                elif line.startswith("JT  - "):
                    journal_name = line[6:].strip()

            # Skip entries without a title or whose PMID was not requested
            study = pmid_dict.get(pmid)
            if study is None or not journal_name:
                continue

            study.publication = journal_name
            to_commit.append(study)
            for v in study.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []

def get_journal_names(bs):
    """
    Sets the journal name of studies (and their versions) from PubMed using BioPython.

    Args:
    - bs (list): Study objects to update. Each must expose `pmid`,
      `publication` and `versions` attributes.

    Returns:
    - list: Every study and version whose `publication` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Map PMID -> study; the PMIDs to request come from the studies
        # themselves rather than from a separate list
        pmid_dict = {str(study.pmid): study for study in bs if study.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                elif line.startswith("JT  - "):
                    journal_name = line[6:].strip()

            # Skip entries without a title or whose PMID was not requested
            study = pmid_dict.get(pmid)
            if study is None or not journal_name:
                continue

            study.publication = journal_name
            to_commit.append(study)
            for v in study.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []
# Run the journal lookup chunk by chunk and gather every returned object
# into one list.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Sets the journal name of studies (and their versions) from PubMed using BioPython.

    Args:
    - bss (list): Study objects to update. Each must expose `pmid`,
      `publication` and `versions` attributes.

    Returns:
    - list: Every study and version whose `publication` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Keys are stringified so they can be joined for the request and
        # compared with the PMID text parsed from the response
        pmid_dict = {str(bs.pmid): bs for bs in bss if bs.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                elif line.startswith("JT  - "):
                    journal_name = line[6:].strip()

            # Skip entries without a title or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not journal_name:
                continue

            bs.publication = journal_name
            to_commit.append(bs)
            for v in bs.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []
# Re-run the lookup with the redefined function. to_commit is not reset
# here, so results are appended to whatever the list already holds.
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Sets the journal name of studies (and their versions) from PubMed using BioPython.

    Args:
    - bss (list): Study objects to update. Each must expose `pmid`,
      `publication` and `versions` attributes.

    Returns:
    - list: Every study and version whose `publication` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Keys are stringified so they can be joined for the request and
        # compared with the PMID text parsed from the response
        pmid_dict = {str(bs.pmid): bs for bs in bss if bs.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                elif line.startswith("JT  - "):
                    journal_name = line[6:].strip()

            # Skip entries without a title or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not journal_name:
                continue

            bs.publication = journal_name
            to_commit.append(bs)
            # Versions go into the same list as their study so one add_all()
            # covers both
            for v in bs.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []
# Another re-run after redefining the function; to_commit is again extended
# without being reset first.
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Copy journal titles from PubMed onto studies and their versions.

    Args:
    - bss (list): Study objects exposing `pmid`, `publication` and `versions`.

    Returns:
    - list: Each updated study followed by its versions, in response order.
      An empty list is returned if any step raises.
    """
    updated = []
    try:
        # NCBI's usage policies ask for a contact e-mail address
        Entrez.email = "[email protected]"
        by_pmid = {study.pmid: study for study in bss}
        # efetch takes the IDs as one comma-separated string
        id_param = ",".join(list(by_pmid.keys()))

        response = Entrez.efetch(db="pubmed", id=id_param, rettype="medline", retmode="text")
        payload = response.read()
        response.close()

        # Entries are split on blank lines; each is scanned line by line
        for entry_text in payload.split("\n\n"):
            record_pmid = None
            title = None
            for row in entry_text.splitlines():
                if row.startswith("PMID- "):
                    record_pmid = row[6:]
                elif row.startswith("JT  - "):
                    title = row[6:]
            if not (record_pmid and title):
                continue

            # Write the title onto the matching study, then onto its versions
            study = by_pmid[record_pmid]
            study.publication = title
            updated.append(study)
            for version in study.versions:
                version.publication = title
                updated.append(version)

    except ValidationError:
        # Validation errors raised by Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Anything else: report it and return nothing
        print(f"An error occurred: {str(e)}")
        return []

    return updated
# Run the journal lookup over every chunk with the latest function
# definition; to_commit is extended, not reset.
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))
to_commit
len(to_commit)
# Spot-check a few entries before writing anything to the database.
to_commit[10]
to_commit[10].publication
to_commit[100].publication
to_commit[1000].publication
to_commit[1500].publication
to_commit[2000].publication
to_commit[2002].publication
# Hand the updated studies and versions to the session and commit.
db.session.add_all(to_commit)
db.session.commit()
# Re-run the audit query to see how many studies still lack a journal.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_journal)
# Next task: studies that have a PMID but a missing/blank authors field.
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)

def get_journal_names(bss):
    """
    Sets the authors of studies (and their versions) from PubMed using BioPython.

    Despite its name, this definition fills in `authors`, not the journal:
    it collects the "FAU - " (full author name) lines of each record.

    Args:
    - bss (list): Study objects to update. Each must expose `pmid`,
      `authors` and `versions` attributes.

    Returns:
    - list: Every study and version whose `authors` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Keys are stringified so they can be joined for the request and
        # compared with the PMID text parsed from the response
        pmid_dict = {str(bs.pmid): bs for bs in bss if bs.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                # The tag is "FAU" plus ONE space before "- " (6 characters),
                # which is what the [6:] slice assumes
                elif line.startswith("FAU - "):
                    authors.append(line[6:].strip())

            # Skip entries without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue

            # FAU values have the form "Last, First", so ";" keeps the
            # individual names separable where "," would not
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []
# Rebuild the chunks from bad_authors, using the same CHUNK_SIZE slices as
# for the journal lookup.
chunks = [bad_authors[i:i+CHUNK_SIZE] for i in range(0, len(bad_authors), CHUNK_SIZE)]
chunks

def get_author_names(bss):
    """
    Sets the authors of studies (and their versions) from PubMed using BioPython.

    Collects the "FAU - " (full author name) lines of each MEDLINE record
    and stores them as one ";"-separated string.

    Args:
    - bss (list): Study objects to update. Each must expose `pmid`,
      `authors` and `versions` attributes.

    Returns:
    - list: Every study and version whose `authors` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Keys are stringified so they can be joined for the request and
        # compared with the PMID text parsed from the response
        pmid_dict = {str(bs.pmid): bs for bs in bss if bs.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                # The tag is "FAU" plus ONE space before "- " (6 characters),
                # which is what the [6:] slice assumes
                elif line.startswith("FAU - "):
                    authors.append(line[6:].strip())

            # Skip entries without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue

            # FAU values have the form "Last, First", so ";" keeps the
            # individual names separable where "," would not
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []
# Start a fresh list and run the author lookup over every chunk.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit

def get_author_names(bss):
    """
    Sets the authors of studies (and their versions) from PubMed using BioPython.

    Collects the "FAU - " (full author name) lines of each MEDLINE record
    and stores them as one ";"-separated string. Each matched author line is
    also printed as a debugging aid.

    Args:
    - bss (list): Study objects to update. Each must expose `pmid`,
      `authors` and `versions` attributes.

    Returns:
    - list: Every study and version whose `authors` was set. Empty when
      there is nothing to look up or when the PubMed request fails.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this literal looks like a redacted placeholder, not a
        # real address -- set a real contact e-mail before running.
        Entrez.email = "[email protected]"

        # Keys are stringified so they can be joined for the request and
        # compared with the PMID text parsed from the response
        pmid_dict = {str(bs.pmid): bs for bs in bss if bs.pmid is not None}
        if not pmid_dict:
            return to_commit

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails
            handle.close()

        # Split records by individual entries
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:].strip()
                # The tag is "FAU" plus ONE space before "- " (6 characters),
                # which is what the [6:] slice assumes
                elif line.startswith("FAU - "):
                    authors.append(line[6:].strip())
                    # Debug output: shows which author lines were matched
                    print(f"{line[6:]}")

            # Skip entries without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue

            # FAU values have the form "Last, First", so ";" keeps the
            # individual names separable where "," would not
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Best-effort lookup: report the problem and return nothing for this batch
        print(f"An error occurred: {str(e)}")
        return []
# Reset the list and repeat the author lookup with the redefined function
# (the version that prints each matched author line).
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))

def get_author_names(bss):
    """
    Copy author names from PubMed onto studies and their versions.

    Every "FAU - " line of a record is collected (and printed as it is
    found); the names are stored as one comma-joined string.

    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.

    Returns:
    - list: Each updated study followed by its versions, in response order.
      An empty list is returned if any step raises.
    """
    updated = []
    try:
        # NCBI's usage policies ask for a contact e-mail address
        Entrez.email = "[email protected]"
        by_pmid = {study.pmid: study for study in bss}
        # efetch takes the IDs as one comma-separated string
        id_param = ",".join(list(by_pmid.keys()))

        response = Entrez.efetch(db="pubmed", id=id_param, rettype="medline", retmode="text")
        payload = response.read()
        response.close()

        # Entries are split on blank lines; each is scanned line by line
        for entry_text in payload.split("\n\n"):
            record_pmid = None
            names = []
            for line in entry_text.splitlines():
                if line.startswith("PMID- "):
                    record_pmid = line[6:]
                elif line.startswith("FAU - "):
                    names.append(line[6:])
                    # Echo each matched author line
                    print(f"{line[6:]}")
            if not (record_pmid and names):
                continue

            # Write the joined names onto the study, then onto its versions
            joined = ",".join(names)
            study = by_pmid[record_pmid]
            study.authors = joined
            updated.append(study)
            for version in study.versions:
                version.authors = joined
                updated.append(version)

    except ValidationError:
        # Validation errors raised by Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Anything else: report it and return nothing
        print(f"An error occurred: {str(e)}")
        return []

    return updated

def get_author_names(bss):
    """
    Copy author names from PubMed onto studies and their versions.

    Every "FAU - " line of a record is collected and the names are stored as
    one comma-joined string.

    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.

    Returns:
    - list: Each updated study followed by its versions, in response order.
      An empty list is returned if any step raises.
    """
    updated = []
    try:
        # NCBI's usage policies ask for a contact e-mail address
        Entrez.email = "[email protected]"
        by_pmid = {study.pmid: study for study in bss}
        # efetch takes the IDs as one comma-separated string
        id_param = ",".join(list(by_pmid.keys()))

        response = Entrez.efetch(db="pubmed", id=id_param, rettype="medline", retmode="text")
        payload = response.read()
        response.close()

        # Entries are split on blank lines; each is scanned line by line
        for entry_text in payload.split("\n\n"):
            record_pmid = None
            names = []
            for row in entry_text.splitlines():
                if row.startswith("PMID- "):
                    record_pmid = row[6:]
                elif row.startswith("FAU - "):
                    names.append(row[6:])
            if not (record_pmid and names):
                continue

            # Write the joined names onto the study, then onto its versions
            joined = ",".join(names)
            study = by_pmid[record_pmid]
            study.authors = joined
            updated.append(study)
            for version in study.versions:
                version.authors = joined
                updated.append(version)

    except ValidationError:
        # Validation errors raised by Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Anything else: report it and return nothing
        print(f"An error occurred: {str(e)}")
        return []

    return updated
# Reset the list, rerun the author lookup, then inspect the result size and
# the authors string of the first entry.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit
len(to_commit)
to_commit[0].authors

def get_author_names(bss):
    """
    Copy author names from PubMed onto studies and their versions.

    Every "FAU - " line of a record is collected and the names are stored as
    one semicolon-joined string.

    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.

    Returns:
    - list: Each updated study followed by its versions, in response order.
      An empty list is returned if any step raises.
    """
    updated = []
    try:
        # NCBI's usage policies ask for a contact e-mail address
        Entrez.email = "[email protected]"
        by_pmid = {study.pmid: study for study in bss}
        # efetch takes the IDs as one comma-separated string
        id_param = ",".join(list(by_pmid.keys()))

        response = Entrez.efetch(db="pubmed", id=id_param, rettype="medline", retmode="text")
        payload = response.read()
        response.close()

        # Entries are split on blank lines; each is scanned line by line
        for entry_text in payload.split("\n\n"):
            record_pmid = None
            names = []
            for row in entry_text.splitlines():
                if row.startswith("PMID- "):
                    record_pmid = row[6:]
                elif row.startswith("FAU - "):
                    names.append(row[6:])
            if not (record_pmid and names):
                continue

            # Write the joined names onto the study, then onto its versions
            joined = ";".join(names)
            study = by_pmid[record_pmid]
            study.authors = joined
            updated.append(study)
            for version in study.versions:
                version.authors = joined
                updated.append(version)

    except ValidationError:
        # Validation errors raised by Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Anything else: report it and return nothing
        print(f"An error occurred: {str(e)}")
        return []

    return updated
# Final author run with the ";"-joining definition: reset, collect, then
# spot-check a few entries.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit
to_commit[0].authors
to_commit[1000].authors
to_commit[5000].authors
# NOTE(review): add() is given the whole list here; the add_all() call on the
# next line is the one intended for a list of objects.
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
# Clear the application cache after the bulk update (presumably so stale
# study data is not served -- confirm).
from neurostore.core import cache
cache.clear()
# Re-run the audit query to see how many studies still lack authors.
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)
# Presumably the interactive-shell command that printed this transcript.
history