Skip to content

Commit

Permalink
script to extract list of invasive plant species; script to extract i…
Browse files Browse the repository at this point in the history
…nvasive insect species and append those in U.S to reduced_insects_labelmap
  • Loading branch information
hzmann committed Nov 26, 2024
1 parent c53bd1f commit c2c788f
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 0 deletions.
55 changes: 55 additions & 0 deletions api/scripts/add_invasive_insects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd

# File paths (Windows-specific absolute paths; raw strings keep the backslashes literal)
original_labelmap_path = r"C:\SDD\PolliNation\api\classifiers\aiy_insects_V1_labelmap.csv"
reduced_labelmap_path = r"C:\SDD\PolliNation\api\scripts\reduced_insects_labelmap.csv"

# Scrape invasive insect list from website
def scrape_invasive_insects():
    """Scrape the CISR page and return a list of invasive insect scientific names.

    Each name is truncated to its first two words (genus + species), dropping
    author abbreviations such as "L." or "Thunb.". Entries with no usable text
    are filtered out, so the returned list never contains None — downstream
    callers lowercase these names and would crash on a None entry.

    Returns:
        list[str]: cleaned scientific names scraped from the page.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    url = "https://cisr.ucr.edu/invasive-species/scientific-names"
    # A timeout prevents the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Ensure the request was successful
    soup = BeautifulSoup(response.text, "html.parser")

    # Scientific names are stored in <em> tags within <li> elements
    invasive_insects = soup.select("ul li em")

    def clean_name(name):
        """Return the first two words of a scientific name, or None if empty."""
        words = name.split()  # split() also strips surrounding whitespace
        if len(words) >= 2:
            # Keep only genus + species
            return " ".join(words[:2])
        if words:  # Single-word name (e.g. genus only)
            return words[0]
        return None  # Empty or invalid

    # Filter out None so callers can safely call .lower() on every entry
    cleaned = (clean_name(insect.text) for insect in invasive_insects)
    return [name for name in cleaned if name is not None]

# Load the labelmaps; both CSVs are expected to contain at least a "name"
# column (and an "id" column, per reduced_insects_labelmap.csv).
original_labelmap = pd.read_csv(original_labelmap_path)
reduced_labelmap = pd.read_csv(reduced_labelmap_path)

# Step 3: Check and update reduced labelmap
def update_reduced_labelmap(original_labelmap, reduced_labelmap, invasive_names):
# Lowercase the scientific names for case-insensitive comparison
original_labelmap["name_lower"] = original_labelmap["name"].str.lower()
reduced_labelmap["name_lower"] = reduced_labelmap["name"].str.lower()

# Find matches between invasive names and the original labelmap
matches = original_labelmap[original_labelmap["name_lower"].isin([name.lower() for name in invasive_names])]

# Exclude names already in the reduced labelmap
new_entries = matches[~matches["name_lower"].isin(reduced_labelmap["name_lower"])]

# Append the new entries to the reduced labelmap and drop unnecessary columns
updated_labelmap = pd.concat([reduced_labelmap, new_entries.drop(columns=["name_lower"])], ignore_index=True)
return updated_labelmap

# Run the pipeline: scrape the invasive names, merge any new matches into the
# reduced labelmap, and overwrite the reduced CSV on disk.
invasive_insects = scrape_invasive_insects()
updated_labelmap = update_reduced_labelmap(original_labelmap, reduced_labelmap, invasive_insects)
# errors="ignore" tolerates the helper column already being absent
updated_labelmap.drop(columns=["name_lower"], errors="ignore").to_csv(reduced_labelmap_path, index=False)
48 changes: 48 additions & 0 deletions api/scripts/further_reduce_plants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
The reduced list of plants is still far too large to make API calls for every species.
This script identifies any plants in nature-id's reduced labelmap that fall under the category of the United States'
most invasive plants. For however many slots remain up to 200, it then randomly selects that number of native plants
from the reduced labelmap to produce a labelmap of 200 plant species.
"""
import requests
from bs4 import BeautifulSoup

# List of URLs to scrape - only two relevant pages on the website
# (the listing is paginated via the "page" query parameter)
urls = [
    "https://www.invasivespeciesinfo.gov/terrestrial/plants?page=0",
    "https://www.invasivespeciesinfo.gov/terrestrial/plants?page=1"
]

# Function to extract scientific names from a single page
def extract_scientific_names(url):
    """Fetch one listing page and return the scientific names it contains.

    The names live in <em> tags nested inside the species-profile card
    markup. Raises requests.HTTPError if the page cannot be retrieved.
    """
    page = requests.get(url)
    page.raise_for_status()  # Ensure the request was successful
    parsed = BeautifulSoup(page.text, "html.parser")

    # Each species card carries its scientific name in an <em> element
    selector = "div.usa-card__body div.field--name-species-profile-scientific-name em"
    names = []
    for tag in parsed.select(selector):
        names.append(tag.text.strip())
    return names

# List to store all scientific names
all_scientific_names = []

# Scrape each hardcoded URL and collect the names in page order
for url in urls:
    names = extract_scientific_names(url)
    all_scientific_names.extend(names)

# Output the results
print("Extracted Scientific Names:")
for name in all_scientific_names:
    print(name)

# Save the names to a file. The encoding is pinned to utf-8 so botanical
# names containing non-ASCII characters (hybrid signs, diacritics) cannot
# raise UnicodeEncodeError under a legacy locale encoding (e.g. cp1252).
output_file = "invasive_plant_species.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for name in all_scientific_names:
        f.write(name + "\n")

print(f"Scientific names saved to {output_file}")
60 changes: 60 additions & 0 deletions api/scripts/invasive_plant_species.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
Dioscorea bulbifera
Elaeagnus umbellata Thunb.
Vitex rotundifolia L. f.
Schinus terebinthifolius
Inula britannica
Cirsium arvense
Ligustrum sinense
Triadica sebifera (L.) Small
Imperata cylindrica
Rhamnus cathartica
Dipsacus fullonum
Linaria dalmatica
Centaurea diffusa
Bromus tectorum
Hedera
Ficaria verna
Ranunculus ficaria
Alliaria petiolata
Heracleum mantegazzianum
Phyllostachys aurea
Lepidium appelianum
Cynoglossum officinale
Berberis thunbergii
Lygodium japonicum
Lonicera japonica
Fallopia japonica
Spiraea japonica
Microstegium vimineum
Sorghum halepense
Pueraria montana
lobata
Euphorbia esula
Taeniatherum caput-medusae
Persicaria perfoliata
Polygonum perfoliatum
Rosa multiflora
Carduus nutans
Lygodium microphyllum
Celastrus orbiculatus
Amaranthus palmeri
Paulownia tomentosa
Centaurea calcitrapa
Elymus repens
repens
Rhaponticum repens
Elaeagnus angustifolia
Nandina domestica
Tamarix
Cytisus scoparius
Onopordum acanthium
Centaurea stoebe
micranthos
Hypericum perforatum
Ailanthus altissima
Solanum viarum
Lepidium draba
Cardaria draba
Striga asiatica
Centaurea solstitialis
Linaria vulgaris
1 change: 1 addition & 0 deletions api/scripts/reduced_insects_labelmap.csv
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,4 @@ id,name
1009,Malacosoma californicum
1011,Epargyreus clarus
1013,Polistes dominula
607,Bagrada hilaris

0 comments on commit c2c788f

Please sign in to comment.