diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index 9da08925c..fb08bbd30 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -23,6 +23,7 @@ dataclass_to_xml, get_chromsome_accessions, get_ena_analysis_process, + reformat_authors_from_loculus_to_embl_style, ) from ena_deposition.ena_types import default_project_type, default_sample_type @@ -188,6 +189,12 @@ def setUp(self): } self.seq_key = {"accession": "test_accession", "version": "test_version"} + def test_format_authors(self): + authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;" + result = reformat_authors_from_loculus_to_embl_style(authors) + desired_result = "Xi L., Smith A.M., Perez Gonzalez A.J.;" + self.assertEqual(result, desired_result) + def test_create_chromosome_list_multi_segment(self): chromosome_list = create_chromosome_list_object( self.unaligned_sequences_multi, self.seq_key diff --git a/ena-submission/src/ena_deposition/config.py b/ena-submission/src/ena_deposition/config.py index 092c59802..0fefc3dca 100644 --- a/ena-submission/src/ena_deposition/config.py +++ b/ena-submission/src/ena_deposition/config.py @@ -50,7 +50,7 @@ def secure_ena_connection(config: Config): config.test = True logging.info("Submitting to ENA dev environment") config.ena_submission_url = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit" - config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json" + config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json" config.ena_reports_service_url = "https://wwwdev.ebi.ac.uk/ena/submit/report" if submit_to_ena_prod: diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py index a6c51d38c..c12a3aa57 100644 --- 
def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
    """Reformat a Loculus author string into the format ENA/EMBL expects.

    Loculus format: ``Doe, John A.; Roe, Jane B. C.``
    EMBL expected:  ``Doe J.A., Roe J.B.C.;``

    EMBL spec: "The names are listed surname first followed by a blank
    followed by initial(s) with stops. Occasionally the initials may not
    be known, in which case the surname alone will be listed. The author
    names are separated by commas and terminated by a semicolon; they are
    not split between lines."
    See section "3.4.10.6: The RA Line" here:
    https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt

    Raises:
        ValueError: if an author entry has no surname (e.g. ``", John"``).
            Previously a comma-less entry raised IndexError instead, which
            the caller's ``except ValueError`` never caught.
    """
    ena_authors = []
    for author in authors.split(";"):
        if not author.strip():
            # Skip empties produced by a trailing or doubled semicolon.
            continue
        # Loculus separates surname from given names at the first comma;
        # an entry without a comma is a surname alone (allowed by the spec).
        last_name, _, first_names = author.partition(",")
        last_name = last_name.strip()
        if not last_name:
            msg = f"Author entry without a surname in: {authors!r}"
            raise ValueError(msg)
        # One initial (each followed by a stop) per given name.
        initials = ".".join(name[0] for name in first_names.split(" ") if name)
        # Surname alone when no initials — no trailing blank before the comma.
        ena_authors.append(f"{last_name} {initials}." if initials else last_name)
    return ", ".join(ena_authors) + ";"
ncbi_dataset_tsv="results/metadata_post_rename.tsv", shell: """ - dataformat tsv virus-genome \ - --package {input.dataset_package} \ - > {output.ncbi_dataset_tsv} + unzip -o {input.dataset_package} -d results + python {input.script} \ + --config-file {input.config} \ + --input results/ncbi_dataset/data/data_report.jsonl \ + --output {output.ncbi_dataset_tsv} """ -rule rename_columns: - input: - ncbi_dataset_tsv="results/metadata_post_extract.tsv", - output: - ncbi_dataset_tsv="results/metadata_post_rename.tsv", - params: - mapping=COLUMN_MAPPING, - run: - rename_columns( - input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping - ) - - if CHECK_ENA_DEPOSITION: rule get_loculus_depositions: diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 71354e67f..4a4538cc5 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -6,8 +6,8 @@ log_level: DEBUG compound_country_field: ncbiGeoLocation fasta_id_field: genbankAccession keep: - - division - - country + - geoLocAdmin1 + - geoLocAdmin2 - submissionId - insdcAccessionBase - insdcVersion @@ -24,52 +24,44 @@ keep: - sequence_md5 - genbankAccession - jointAccession -column_mapping: - Accession: genbankAccession - BioProjects: bioprojects - BioSample accession: biosampleAccession - Gene count: ncbi_gene_count - Geographic Location: ncbiGeoLocation - Geographic Region: ncbiGeoRegion - Host Common Name: ncbiHostCommonName - Host Infraspecific Names Breed: ncbiHostBreed - Host Infraspecific Names Cultivar: ncbiHostCultivar - Host Infraspecific Names Ecotype: ncbiHostEcotype - Host Infraspecific Names Isolate: ncbiHostIsolate - Host Infraspecific Names Sex: ncbiHostSex - Host Infraspecific Names Strain: ncbiHostStrain - Host Name: ncbiHostName - Host Pangolin Classification: ncbiHostPangolin - Host Taxonomic ID: ncbiHostTaxId - Is Annotated: ncbiIsAnnotated - Is Complete: ncbiIsComplete - Is Lab Host: ncbiIsLabHost - Is Vaccine Strain: ncbiIsVaccineStrain - Isolate 
Collection date: ncbiCollectionDate - Isolate Lineage: ncbiIsolateName - Isolate Lineage source: ncbiIsolateSource - Lab Host: ncbiLabHost - Mature peptide count: ncbiMaturePeptideCount - Molecule type: ncbiMolType - Protein count: ncbiProteinCount - Purpose of Sampling: ncbiPurposeOfSampling - Release date: ncbiReleaseDate - Source database: ncbiSourceDb - SRA Accessions: ncbiSraAccessions - Submitter Affiliation: ncbiSubmitterAffiliation - Submitter Country: ncbiSubmitterCountry - Submitter Names: ncbiSubmitterNames - Update date: ncbiUpdateDate - Virus Common Name: ncbiVirusCommonName - Virus Infraspecific Names Breed: ncbiVirusBreed - Virus Infraspecific Names Cultivar: ncbiVirusCultivar - Virus Infraspecific Names Ecotype: ncbiVirusEcotype - Virus Infraspecific Names Isolate: ncbiVirusIsolate - Virus Infraspecific Names Sex: ncbi_virus - Virus Infraspecific Names Strain: ncbiVirusStrain - Virus Name: ncbiVirusName - Virus Pangolin Classification: ncbiVirusPangolin - Virus Taxonomic ID: ncbiVirusTaxId +simple_mappings: + "ncbiReleaseDate": "releaseDate" + "ncbiIsAnnotated": "isAnnotated" + "ncbiIsLabHost": "isLabHost" + "ncbiProteinCount": "proteinCount" + "ncbiSourceDb": "sourceDatabase" + "ncbiIsComplete": "completeness" + "ncbiLabHost": "labHost" + "ncbiUpdateDate": "updateDate" + "genbankAccession": "accession" + "biosampleAccession": "biosample" + "ncbi_gene_count": "geneCount" + "bioprojects": "bioprojects" + "ncbiSraAccessions": "sraAccessions" +location_mappings: + "ncbiGeoLocation": "geographicLocation" + "ncbiGeoRegion": "geographicRegion" +submitter_mappings: + "ncbiSubmitterAffiliation": "affiliation" + "ncbiSubmitterNames": "names" + "ncbiSubmitterCountry": "country" +isolate_mappings: + "ncbiIsolateName": "name" + "ncbiIsolateSource": "source" + "ncbiCollectionDate": "collectionDate" +virus_mappings: + "ncbiVirusName": "organismName" + "ncbiVirusTaxId": "taxId" +host_mappings: + "ncbiHostTaxId": "taxId" + "ncbiHostName": "organismName" 
+parse_list: + - bioprojects + - ncbiSraAccessions +unknown_mappings: # I don't know yet where these fields come from + - ncbiHostCommonName + - ncbiPurposeOfSampling + - ncbiHostSex group_name: insdc_ingest_group # Used only to set the group name, never read username: insdc_ingest_user password: insdc_ingest_user @@ -79,3 +71,335 @@ approve_timeout_min: "25" # Cronjobs run every 30min, make approve stop before i db_username: postgres db_password: unsecure db_url: "jdbc:postgresql://127.0.0.1:5432/loculus" +min_score: 89 +country_codes: + Afghanistan: AF + Albania: AL + Algeria: DZ + American Samoa: AS + Andorra: AD + Angola: AO + Anguilla: AI + Antarctica: AQ + Antigua and Barbuda: AG + Arctic Ocean: null + Argentina: AR + Armenia: AM + Aruba: AW + Ashmore and Cartier Islands: null + Atlantic Ocean: null + Australia: AU + Austria: AT + Azerbaijan: AZ + Bahamas: BS + Bahrain: BH + Baltic Sea: null + Baker Island: null + Bangladesh: BD + Barbados: BB + Bassas da India: null + Belarus: BY + Belgium: BE + Belize: BZ + Benin: BJ + Bermuda: BM + Bhutan: BT + Bolivia: BO + Borneo: null + Bosnia and Herzegovina: BA + Botswana: BW + Bouvet Island: BV + Brazil: BR + British Virgin Islands: VG + Brunei: BN + Bulgaria: BG + Burkina Faso: BF + Burundi: BI + Cambodia: KH + Cameroon: CM + Canada: CA + Cape Verde: CV + Cayman Islands: KY + Central African Republic: CF + Chad: TD + Chile: CL + China: CN + Christmas Island: CX + Clipperton Island: null + Cocos Islands: CC + Colombia: CO + Comoros: KM + Cook Islands: CK + Coral Sea Islands: null + Costa Rica: CR + Cote d'Ivoire: CI + Croatia: HR + Cuba: CU + Curacao: CW + Cyprus: CY + Czechia: CZ + Democratic Republic of the Congo: CD + Denmark: DK + Djibouti: DJ + Dominica: DM + Dominican Republic: DO + Ecuador: EC + Egypt: EG + El Salvador: SV + Equatorial Guinea: GQ + Eritrea: ER + Estonia: EE + Eswatini: SZ + Ethiopia: ET + Europa Island: null + Falkland Islands (Islas Malvinas): FK + Faroe Islands: FO + Fiji: FJ + Finland: FI + 
France: FR + French Guiana: GF + French Polynesia: PF + French Southern and Antarctic Lands: TF + Gabon: GA + Gambia: GM + Gaza Strip: null + Georgia: GE + Germany: DE + Ghana: GH + Gibraltar: GI + Glorioso Islands: null + Greece: GR + Greenland: GL + Grenada: GD + Guadeloupe: GP + Guam: GU + Guatemala: GT + Guernsey: GG + Guinea: GN + Guinea-Bissau: GW + Guyana: GY + Haiti: HT + Heard Island and McDonald Islands: HM + Honduras: HN + Hong Kong: HK + Howland Island: null + Hungary: HU + Iceland: IS + India: IN + Indian Ocean: null + Indonesia: ID + Iran: IR + Iraq: IQ + Ireland: IE + Isle of Man: IM + Israel: IL + Italy: IT + Jamaica: JM + Jan Mayen: null + Japan: JP + Jarvis Island: null + Jersey: JE + Johnston Atoll: null + Jordan: JO + Juan de Nova Island: null + Kazakhstan: KZ + Kenya: KE + Kerguelen Archipelago: null + Kingman Reef: null + Kiribati: KI + Kosovo: null # XK is not recognized by the ISO + Kuwait: KW + Kyrgyzstan: KG + Laos: LA + Latvia: LV + Lebanon: LB + Lesotho: LS + Liberia: LR + Libya: LY + Liechtenstein: LI + Line Islands: null + Lithuania: LT + Luxembourg: LU + Macau: MO + Madagascar: MG + Malawi: MW + Malaysia: MY + Maldives: MV + Mali: ML + Malta: MT + Marshall Islands: MH + Martinique: MQ + Mauritania: MR + Mauritius: MU + Mayotte: YT + Mediterranean Sea: null + Mexico: MX + Micronesia, Federated States of: FM + Midway Islands: null + Moldova: MD + Monaco: MC + Mongolia: MN + Montenegro: ME + Montserrat: MS + Morocco: MA + Mozambique: MZ + Myanmar: MM + Namibia: NA + Nauru: NR + Navassa Island: null + Nepal: NP + Netherlands: NL + New Caledonia: NC + New Zealand: NZ + Nicaragua: NI + Niger: NE + Nigeria: NG + Niue: NU + Norfolk Island: NF + North Korea: KP + North Macedonia: MK + North Sea: null + Northern Mariana Islands: MP + Norway: NO + Oman: OM + Pacific Ocean: null + Pakistan: PK + Palau: PW + Palmyra Atoll: null + Panama: PA + Papua New Guinea: PG + Paracel Islands: null + Paraguay: PY + Peru: PE + Philippines: PH + Pitcairn 
Islands: PN + Poland: PL + Portugal: PT + Puerto Rico: PR + Qatar: QA + Republic of the Congo: CG + Reunion: RE + Romania: RO + Ross Sea: null + Russia: RU + Rwanda: RW + Saint Barthelemy: BL + Saint Helena: SH + Saint Kitts and Nevis: KN + Saint Lucia: LC + Saint Martin: MF + Saint Pierre and Miquelon: PM + Saint Vincent and the Grenadines: VC + Samoa: WS + San Marino: SM + Sao Tome and Principe: ST + Saudi Arabia: SA + Senegal: SN + Serbia: RS + Seychelles: SC + Sierra Leone: SL + Singapore: SG + Sint Maarten: SX + Slovakia: SK + Slovenia: SI + Solomon Islands: SB + Somalia: SO + South Africa: ZA + South Georgia and the South Sandwich Islands: GS + South Korea: KR + South Sudan: SS + Southern Ocean: null + Spain: ES + Spratly Islands: null + Sri Lanka: LK + State of Palestine: PS + Sudan: SD + Suriname: SR + Svalbard: SJ + Sweden: SE + Switzerland: CH + Syria: SY + Taiwan: TW + Tajikistan: TJ + Tanzania: TZ + Tasman Sea: null + Thailand: TH + Timor-Leste: TL + Togo: TG + Tokelau: TK + Tonga: TO + Trinidad and Tobago: TT + Tromelin Island: null + Tunisia: TN + Turkey: TR + Turkmenistan: TM + Turks and Caicos Islands: TC + Tuvalu: TV + Uganda: UG + Ukraine: UA + United Arab Emirates: AE + United Kingdom: GB + Uruguay: UY + USA: US + Uzbekistan: UZ + Vanuatu: VU + Venezuela: VE + Viet Nam: VN + Virgin Islands: VI + Wake Island: null + Wallis and Futuna: WF + West Bank: null + Western Sahara: EH + Yemen: YE + Zambia: ZM + Zimbabwe: ZW + Belgian Congo: CGO + British Guiana: BG + Burma: BU + Czechoslovakia: CS + Czech Republic: CZ + East Timor: TP + Korea: null + Macedonia: MK + Micronesia: FM + Netherlands Antilles: AN + Serbia and Montenegro: CS + Siam: null + Swaziland: SZ + The former Yugoslav Republic of Macedonia: MK + USSR: SU + Yugoslavia: YU + Zaire: ZR +administrative_divisions: + - "autonomous region" + - region + - state + - province + - oblast + - krug + - territory + - zone + - subdistrict + - district + - canton + - subprefecture + - prefecture + - 
def convert_to_title_case(name: str) -> str:
    """Title-case *name* while keeping common name particles lowercase."""
    # Particles/prepositions that conventionally stay lowercase in names.
    particles = {"de", "la", "van", "den", "der", "le", "du", "von", "del"}
    titled_words = name.title().split()
    return " ".join(
        word.lower() if word.lower() in particles else word for word in titled_words
    )


def reformat_authors_from_genbank_to_loculus(
    authors_list: list[str], insdc_accession_base: str
) -> str:
    """Reformat GenBank author entries into the Loculus author string.

    GenBank gives one ``"Surname,Initials"`` entry per author, e.g.
    ``['Xi,L.', 'Yu,X.']``; the result is ``"Xi, L.; Yu, X."``.
    Entries with an unexpected number of commas are logged and dropped.
    """
    if not authors_list:
        return ""

    reformatted = []
    for author in authors_list:
        collapsed = re.sub(r"\s\s+", " ", author)  # squeeze runs of whitespace
        parts = [piece for piece in collapsed.split(",") if piece]
        if len(parts) == 2:
            reformatted.append(f"{parts[0].strip()}, {parts[1].strip()}")
        elif len(parts) == 1:
            # Surname only — keep the trailing ", " the Loculus format uses.
            reformatted.append(f"{parts[0].strip()}, ")
        else:
            msg = (
                f"{insdc_accession_base}: Unexpected number of commas in author {author} "
                f"not adding author to authors list"
            )
            logger.error(msg)
            continue

    joined = "; ".join(reformatted)
    # Some journals submit all-caps author lists; normalise those.
    if joined.isupper():
        joined = convert_to_title_case(joined)
    return joined
def sort_authors(authors: str) -> str:
    """Sort authors alphabetically.

    *authors* is a "; "-separated string (Loculus style). Entries are
    trimmed and empty entries — e.g. produced by a trailing semicolon —
    are dropped so they cannot sort to the front and yield a result with
    a leading "; ".
    """
    trimmed = (author.strip() for author in authors.split(";"))
    return "; ".join(sorted(author for author in trimmed if author))
def format_geo_loc_admin2(division: str, matched_geo_loc_admin1: str) -> str:
    """Remove the matched geo_loc_admin1 from the division string and
    return whatever remains, comma-joined and whitespace-normalised."""
    leftover = division.replace(matched_geo_loc_admin1, "").strip().rstrip(",")
    pieces = []
    for chunk in leftover.split(","):
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)
    collapsed = ", ".join(pieces)
    # Collapse any whitespace runs left behind by the removal.
    return re.sub(r"\s+", " ", collapsed).strip()
query.lower().replace(admin_region.lower(), "") + break + match, score = process.extractOne(query, geo_loc_admin1_list, scorer=fuzz.partial_ratio) + if score >= config.min_score: + return match + return "" + + +def get_geo_loc_admin1_options(country: str, config: Config) -> tuple[list[str], dict[str, str]]: + country_code = config.country_codes.get(country) + if not country_code: + return [], {} + try: + geolocadmin1_options = [ + unidecode.unidecode(division.name) # pycountry returns non-ASCII characters + for division in pycountry.subdivisions.get(country_code=country_code) + if division.parent_code is None # Only get the top level subdivisions + ] + geolocadmin1_abbreviations = { + division.code: unidecode.unidecode(division.name) + for division in pycountry.subdivisions.get(country_code=country_code) + } + geolocadmin1_abbreviations = { + abbrev.split("-")[1]: name for abbrev, name in geolocadmin1_abbreviations.items() + } + except Exception as e: + try: + # Try to get the historic subdivisions if the current ones don't work + geolocadmin1_options = [ + unidecode.unidecode(division.name) # pycountry returns non-ASCII characters + for division in pycountry.historic_countries.get(country_code=country_code) + if division.parent_code is None # Only get the top level subdivisions + ] + geolocadmin1_abbreviations = { + division.code: unidecode.unidecode(division.name) + for division in pycountry.historic_countries.get(country_code=country_code) + } + geolocadmin1_abbreviations = { + abbrev.split("-")[1]: name for abbrev, name in geolocadmin1_abbreviations.items() + } + except Exception: + logger.error(f"Error getting subdivisions for {country}: {e}") + return [], {} + return geolocadmin1_options, geolocadmin1_abbreviations + + +def get_geoloc(input_string: str, config: Config) -> tuple[str, str, str]: + """ + Takes INSDC geolocation string in format `country: division` + Returns country and attempts to split division into geoLocAdmin1 and geoLocAdmin2. + 1. 
Use pycountry for official list of geoLocAdmin1 options and abbreviations + 2. Attempt exact match of division substring (split by ",") to geoLocAdmin1 + 3. Attempt exact match of division substring (split by "\s" or ",") to geoLocAdmin1 abbr + 4. Attempt fuzzy match of division substring (after removing common administrative_divisions + substrings) to geoLocAdmin1 + 5. If no match, return division as geoLocAdmin2 + """ + country = input_string.split(":", 1)[0].strip() + division = input_string.split(":", 1)[1].strip() if len(input_string.split(":", 1)) == 2 else "" + + geolocadmin1_options, geolocadmin1_abbreviations = get_geo_loc_admin1_options(country, config) + if not geolocadmin1_options: + return country, division, "" + + # Try to find an exact substring match for subdivision + for option in geolocadmin1_options: + division_words = [word.strip() for word in division.lower().split(",")] + if option.lower() in division_words: + return country, option, format_geo_loc_admin2(division, option) + + # Try to find an exact substring match subdivision abbreviation + for option, name in geolocadmin1_abbreviations.items(): + division_words = re.split(r"[,\s]+", division) + if option in division_words: + return country, name, format_geo_loc_admin2(division, option) + + # Try to find a fuzzy match for subdivision + division_words = [name.strip() for name in division.split(",") if name] + for division_word in division_words: + fuzzy_match = fuzzy_match_geo_loc_admin1(division_word, geolocadmin1_options, config) + if fuzzy_match: + logger.info(f"Fuzzy matched {division_word} to {fuzzy_match}") + return country, fuzzy_match, format_geo_loc_admin2(division, division_word) + return country, "", division @click.command() @@ -98,15 +188,12 @@ def main( for record in metadata: # Transform the metadata - try: - record["division"] = record[config.compound_country_field].split(":", 1)[1].strip() - except IndexError: - record["division"] = "" - record["country"] = 
record[config.compound_country_field].split(":", 1)[0].strip() + record["country"], record["geoLocAdmin1"], record["geoLocAdmin2"] = get_geoloc( + record[config.compound_country_field], config + ) record["submissionId"] = record[config.fasta_id_field] record["insdcAccessionBase"] = record[config.fasta_id_field].split(".", 1)[0] record["insdcVersion"] = record[config.fasta_id_field].split(".", 1)[1] - record["ncbiSubmitterNames"] = split_authors(record["ncbiSubmitterNames"]) if config.segmented: record["segment"] = segments_dict.get(record[config.fasta_id_field], "") diff --git a/ingest/tests/config_cchf/config.yaml b/ingest/tests/config_cchf/config.yaml index 2ae38b880..65995b7b5 100644 --- a/ingest/tests/config_cchf/config.yaml +++ b/ingest/tests/config_cchf/config.yaml @@ -16,7 +16,6 @@ organism: cchf rename: bioprojects: bioprojectAccession country: geoLocCountry - division: geoLocAdmin1 genbankAccession: insdcAccessionFull ncbiCollectionDate: sampleCollectionDate ncbiHostName: hostNameScientific diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.json b/ingest/tests/expected_output_cchf/metadata_post_prepare.json new file mode 100644 index 000000000..f20987c9a --- /dev/null +++ b/ingest/tests/expected_output_cchf/metadata_post_prepare.json @@ -0,0 +1,158 @@ +{ + "KX013462.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "Astrahanskaja oblast'", + "geoLocAdmin2": "", + "geoLocCountry": "Russia", + "hash": "7d43b0538a13b718babb885c5a985fd8", + "hostNameScientific": "Ixodoidea", + "hostTaxonId": "297308", + "insdcAccessionBase": "KX013462", + "insdcAccessionFull": "KX013462.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": 
"2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1989", + "segment": "L", + "specimenCollectorSampleId": "K229_194", + "sraRunAccession": "", + "submissionId": "KX013462.1" + }, + "KX013463.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "Astrahanskaja oblast'", + "geoLocAdmin2": "", + "geoLocCountry": "Russia", + "hash": "d8ee414037323ed4c3a07ca5d3e6c71e", + "hostNameScientific": "Ixodoidea", + "hostTaxonId": "297308", + "insdcAccessionBase": "KX013463", + "insdcAccessionFull": "KX013463.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1989", + "segment": "M", + "specimenCollectorSampleId": "K229_194", + "sraRunAccession": "", + "submissionId": "KX013463.1" + }, + "KX013464.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "Astrahanskaja oblast'", + "geoLocAdmin2": "", + "geoLocCountry": "Russia", + "hash": "0f73aa76d08ffbe5a4dd96eb8b8de95b", + "hostNameScientific": "Ixodoidea", + "hostTaxonId": "297308", + "insdcAccessionBase": "KX013464", + "insdcAccessionFull": "KX013464.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": 
"3052518", + "sampleCollectionDate": "1989", + "segment": "S", + "specimenCollectorSampleId": "K229_194", + "sraRunAccession": "", + "submissionId": "KX013464.1" + }, + "KX013483.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "", + "geoLocAdmin2": "", + "geoLocCountry": "Uganda", + "hash": "f1875fdd32cb34bf7792f4175604bc3d", + "hostNameScientific": "Homo sapiens", + "hostTaxonId": "9606", + "insdcAccessionBase": "KX013483", + "insdcAccessionFull": "KX013483.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1958", + "segment": "L", + "specimenCollectorSampleId": "Nakiwogo", + "sraRunAccession": "", + "submissionId": "KX013483.1" + }, + "KX013485.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "", + "geoLocAdmin2": "", + "geoLocCountry": "Uganda", + "hash": "831e58883a345a466977c13af3492ee9", + "hostNameScientific": "Homo sapiens", + "hostTaxonId": "9606", + "insdcAccessionBase": "KX013485", + "insdcAccessionFull": "KX013485.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1958", + "segment": "S", + "specimenCollectorSampleId": "Nakiwogo", + "sraRunAccession": "", + 
"submissionId": "KX013485.1" + }, + "KX096703.1": { + "authorAffiliations": "Public Health England, Research", + "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "", + "geoLocAdmin2": "Sairam district", + "geoLocCountry": "Kazakhstan", + "hash": "77de1141bacb559ea14c89c351df8372", + "hostNameScientific": "Hyalomma anatolicum", + "hostTaxonId": "176092", + "insdcAccessionBase": "KX096703", + "insdcAccessionFull": "KX096703.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-04-30T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-04-30T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "2015", + "segment": "S", + "specimenCollectorSampleId": "tick pool #134", + "sraRunAccession": "", + "submissionId": "KX096703.1" + } +} \ No newline at end of file diff --git a/ingest/tests/test_data_cchf/metadata_post_extract.tsv b/ingest/tests/test_data_cchf/metadata_post_extract.tsv deleted file mode 100644 index 5681090d4..000000000 --- a/ingest/tests/test_data_cchf/metadata_post_extract.tsv +++ /dev/null @@ -1,8 +0,0 @@ -Accession BioProjects BioSample accession Completeness Gene count Geographic Location Geographic Region Host Common Name Host Infraspecific Names Breed Host Infraspecific Names Cultivar Host Infraspecific Names Ecotype Host Infraspecific Names Isolate Host Infraspecific Names Sex Host Infraspecific Names Strain Host Name Host Pangolin Classification Host Taxonomic ID Is Annotated Is Complete Is Lab Host Is Vaccine Strain Isolate Collection date Isolate Lineage Isolate Lineage source Lab Host Length Mature peptide count Molecule type Nucleotide completeness Protein count Purpose of Sampling Release date Source database SRA Accessions Submitter Affiliation Submitter Country 
Submitter Names Update date Virus Common Name Virus Infraspecific Names Breed Virus Infraspecific Names Cultivar Virus Infraspecific Names Ecotype Virus Infraspecific Names Isolate Virus Infraspecific Names Sex Virus Infraspecific Names Strain Virus Name Virus Pangolin Classification Virus Taxonomic ID -KX013462.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true 1989 K229_194 12109 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013463.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true 1989 K229_194 5343 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013464.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true 1989 K229_194 1641 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX096703.1 PARTIAL 1 Kazakhstan: Sairam district Asia Hyalomma anatolicum 176092 true 2015 tick pool #134 513 1 2016-04-30T00:00:00Z GenBank Public Health England, Research Deryabin,P.,Atshabar,B.,Sansyzbaev,Y.,Berezin,V.,Nurmakhanov,T.,Yeskhojayev,O.,Vilkova,A.,Shevtsov,A.,Hewson,R.,Atkinson,B. 2016-04-30T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013483.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 12098 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 
2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013485.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 1571 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -INSDC001.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 1571 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 diff --git a/ingest/tests/test_data_cchf/ncbi_dataset.zip b/ingest/tests/test_data_cchf/ncbi_dataset.zip new file mode 100644 index 000000000..a94f72b44 Binary files /dev/null and b/ingest/tests/test_data_cchf/ncbi_dataset.zip differ diff --git a/ingest/tests/test_ingest.py b/ingest/tests/test_ingest.py index d469efb79..543f66d4a 100644 --- a/ingest/tests/test_ingest.py +++ b/ingest/tests/test_ingest.py @@ -62,6 +62,7 @@ def test_snakemake(): destination_directory = CONFIG_DIR source_directory = TEST_DATA_DIR / "config_cchf" copy_files(source_directory, destination_directory) + run_snakemake("extract_ncbi_dataset_sequences", touch=True) # Ignore sequences for now run_snakemake("get_loculus_depositions", touch=True) # Do not call_loculus run_snakemake("group_segments") run_snakemake("get_previous_submissions", touch=True) # Do not call_loculus diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 9829b3355..f366acf56 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -424,7 +424,6 @@ defaultOrganismConfig: &defaultOrganismConfig autocomplete: true initiallyVisible: true header: Sample details - ingest: division - name: geoLocAdmin2 displayName: Collection subdivision level 2 generateIndex: true @@ -452,6 +451,10 
def valid_authors(authors: str) -> bool:
    """Return True if *authors* matches the Loculus author-list format.

    Expected shape: one or more `last name, first name(s);` entries, e.g.
    `Smith, Anna; Perez, Tom J.;`. The comma is mandatory per author; the
    first-name part may be empty (`Xu,;`). Only ASCII letters, whitespace,
    dots, hyphens and apostrophes are allowed; the final semicolon is optional.
    """
    pattern = r"^([a-zA-Z\s\.\-\']+,[a-zA-Z\s\.\-\']*;)*([a-zA-Z\s\.\-\']+,[a-zA-Z\s\.\-\']*;*)$"
    return re.match(pattern, authors) is not None


def warn_potentially_invalid_authors(authors: str) -> bool:
    """Heuristic: flag strings that parse as a single author but look like several.

    A semicolon-free string with more than three comma/whitespace-separated
    tokens (e.g. `Anna Smith, Cameron Tucker`) is probably a mis-delimited
    list rather than one author with many names.
    """
    tokens = re.split(r"[,\s]+", authors)
    return bool(";" not in authors and len(tokens) > 3)


def format_authors(authors: str) -> str:
    """Normalize a valid authors string to canonical Loculus style.

    Input entries are `last,first` separated by `;`; output is
    `last, first; last, first` with single spacing and dotted, upper-cased
    single-letter initials (`Xi,L;Yu,x.` -> `Xi, L.; Yu, X.`).

    Call only after `valid_authors` — an entry without a comma raises
    IndexError.
    """
    formatted_entries = []
    for entry in (a for a in authors.split(";") if a):
        # Collapse runs of whitespace so names keep exactly one space.
        normalized = re.sub(r"\s\s+", " ", entry)
        parts = normalized.split(",")
        last_name = parts[0].strip()
        first_name = parts[1].strip()
        # Upper-case and dot single-letter initials; leave full names alone.
        initials = []
        for name in first_name.split():
            if len(name) == 1:
                initials.append(f"{name.upper()}.")
            elif len(name) == 2 and name.endswith("."):
                initials.append(name.upper())
            else:
                initials.append(name)
        formatted_entries.append(f"{last_name}, {' '.join(initials)}")
    return "; ".join(formatted_entries).strip()
@classmethod def call_function( @@ -398,6 +432,91 @@ def concatenate( warnings=warnings, ) + @staticmethod + def check_authors( + input_data: InputMetadata, output_field: str, args: FunctionArgs = None + ) -> ProcessingResult: + authors = input_data["authors"] + + author_format_description = ( + "Please ensure that " + "authors are separated by semi-colons. Each author's name should be in the format " + "'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to " + "separate first names/initials from last name. Only ASCII alphabetical characters A-Z " + "are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' " + "or 'Xu,;' if the first name is unknown." + ) + warnings: list[ProcessingAnnotation] = [] + errors: list[ProcessingAnnotation] = [] + + if not authors: + return ProcessingResult( + datum=None, + warnings=warnings, + errors=errors, + ) + try: + authors.encode("ascii") + except UnicodeEncodeError: + error_message = ( + f"The authors list '{authors}' contains non-ASCII characters. " + + author_format_description + ) + return ProcessingResult( + datum=None, + errors=[ + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message=error_message, + ) + ], + warnings=warnings, + ) + if valid_authors(authors): + formatted_authors = format_authors(authors) + if warn_potentially_invalid_authors(authors): + warning_message = ( + f"The authors list '{authors}' might not be using the Loculus format. " + + author_format_description + ) + warnings = [ + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message=warning_message, + ) + ] + return ProcessingResult( + datum=formatted_authors, + warnings=warnings, + errors=errors, + ) + return ProcessingResult( + datum=formatted_authors, + warnings=warnings, + errors=errors, + ) + + error_message = ( + f"The authors list '{authors}' is not in a recognized format. 
" + + author_format_description + ) + return ProcessingResult( + datum=None, + errors=[ + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message=error_message, + ) + ], + warnings=warnings, + ) + @staticmethod def identity( input_data: InputMetadata, output_field: str, args: FunctionArgs = None diff --git a/preprocessing/nextclade/tests/test.py b/preprocessing/nextclade/tests/test.py index 1b77028aa..a2f9e6da4 100644 --- a/preprocessing/nextclade/tests/test.py +++ b/preprocessing/nextclade/tests/test.py @@ -9,6 +9,7 @@ ) from loculus_preprocessing.prepro import process_all from loculus_preprocessing.processing_functions import format_frameshift, format_stop_codon +from loculus_preprocessing.processing_functions import valid_authors, format_authors test_config_file = "tests/test_config.yaml" @@ -296,6 +297,55 @@ def get_test_cases(config: Config) -> list[TestCase]: ], ), ), + TestCase( + name="warn_potential_author_error", + input=UnprocessedEntryFactory.create_unprocessed_entry( + metadata_dict={ + "submissionId": "warn_potential_author_error", + "name_required": "name", + "required_collection_date": "2022-11-01", + "authors": "Anna Smith, Cameron Tucker", + } + ), + expected_output=factory_custom.create_processed_entry( + metadata_dict={ + "name_required": "name", + "required_collection_date": "2022-11-01", + "concatenated_string": "LOC_12.1/2022-11-01", + "authors": "Anna Smith, Cameron Tucker", + }, + metadata_warnings=[ + ( + "authors", + "The authors list 'Anna Smith, Cameron Tucker' might not be using the Loculus format. Please ensure that authors are separated by semi-colons. Each author's name should be in the format 'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to separate first names/initials from last name. Only ASCII alphabetical characters A-Z are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' or 'Xu,;' if the first name is unknown." 
+ ), + ], + ), + ), + TestCase( + name="non_ascii_authors", + input=UnprocessedEntryFactory.create_unprocessed_entry( + metadata_dict={ + "submissionId": "non_ascii_authors", + "name_required": "name", + "required_collection_date": "2022-11-01", + "authors": "Møller, Anäis; Pérez, José", + } + ), + expected_output=factory_custom.create_processed_entry( + metadata_dict={ + "name_required": "name", + "required_collection_date": "2022-11-01", + "concatenated_string": "LOC_13.1/2022-11-01", + }, + metadata_errors=[ + ( + "authors", + "The authors list 'Møller, Anäis; Pérez, José' contains non-ASCII characters. Please ensure that authors are separated by semi-colons. Each author's name should be in the format 'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to separate first names/initials from last name. Only ASCII alphabetical characters A-Z are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' or 'Xu,;' if the first name is unknown." + ), + ], + ), + ), ] @@ -303,7 +353,53 @@ def sort_annotations(annotations: list[ProcessingAnnotation]): return sorted(annotations, key=lambda x: (x.source[0].name, x.message)) +accepted_authors = { + "Xi, L.; Yu, X.;": "Xi, L.; Yu, X.", + "Xi,L;Yu,X.;": "Xi, L.; Yu, X.", + "Xi,;Yu,X.;": "Xi, ; Yu, X.", + "Xi, ;Yu,X.;": "Xi, ; Yu, X.", + "Xi, ;Yu,X.": "Xi, ; Yu, X.", + "Xi,;": "Xi,", + "Xi,": "Xi,", + "Smith, Anna Maria; Perez, Jose X.;": "Smith, Anna Maria; Perez, Jose X.", + "Smith,Anna Maria;Perez,Jose X;": "Smith, Anna Maria; Perez, Jose X.", + "de souza, a.": "de souza, A.", + "McGregor, Ewan": "McGregor, Ewan", +} +not_accepted_authors = [ + ";", + ",;", + ",X.;Yu,X.", + ",;Yu,X.", + "Anna Maria Smith; Jose X. 
Perez", + "Anna Maria Smith;", + "Anna Maria Smith", + "Smith9, Anna;", + "Anna Smith, Cameron Tucker, and Jose Perez", +] + + class PreprocessingTests(unittest.TestCase): + def test_valid_authors(self) -> None: + for author in accepted_authors: + if valid_authors(author) is not True: + msg = f"{author} should be accepted but is not." + raise AssertionError(msg) + for author in not_accepted_authors: + if valid_authors(author) is not False: + msg = f"{author} should not be accepted but is." + raise AssertionError(msg) + + def test_format_authors(self) -> None: + for author, formatted_author in accepted_authors.items(): + if format_authors(author) != formatted_author: + print(format_authors(author)) + msg = ( + f"{author} is not formatted: '{format_authors(author)}' " + f"as expected: '{formatted_author}'" + ) + raise AssertionError(msg) + def test_process_all(self) -> None: config: Config = get_config(test_config_file) test_cases = get_test_cases(config=config) diff --git a/preprocessing/nextclade/tests/test_config.yaml b/preprocessing/nextclade/tests/test_config.yaml index ba4327c08..526d2337d 100644 --- a/preprocessing/nextclade/tests/test_config.yaml +++ b/preprocessing/nextclade/tests/test_config.yaml @@ -73,4 +73,8 @@ processing_spec: type: [string, string, date] inputs: continent: continent - required_collection_date: required_collection_date \ No newline at end of file + required_collection_date: required_collection_date + authors: + function: check_authors + inputs: + authors: authors \ No newline at end of file