Skip to content

Commit

Permalink
script to extract list of invasive plant species; script to extract i…
Browse files Browse the repository at this point in the history
…nvasive insect species and append those in U.S to reduced_insects_labelmap
  • Loading branch information
hzmann committed Nov 26, 2024
1 parent c53bd1f commit c2c788f
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 0 deletions.
55 changes: 55 additions & 0 deletions api/scripts/add_invasive_insects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd

# File paths (Windows-specific absolute paths; raw strings keep the backslashes literal)
original_labelmap_path = r"C:\SDD\PolliNation\api\classifiers\aiy_insects_V1_labelmap.csv"
reduced_labelmap_path = r"C:\SDD\PolliNation\api\scripts\reduced_insects_labelmap.csv"

# Scrape invasive insect list from website
def scrape_invasive_insects():
    """Scrape the CISR page and return a list of invasive insect scientific names.

    Each name is truncated to its first two words (genus + species), dropping
    author abbreviations such as "L." or "Thunb.". Entries with no usable text
    are filtered out, so the returned list never contains None — downstream
    callers lowercase these names and would crash on a None entry.

    Returns:
        list[str]: cleaned scientific names scraped from the page.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    url = "https://cisr.ucr.edu/invasive-species/scientific-names"
    # A timeout prevents the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Ensure the request was successful
    soup = BeautifulSoup(response.text, "html.parser")

    # Scientific names are stored in <em> tags within <li> elements
    invasive_insects = soup.select("ul li em")

    def clean_name(name):
        """Return the first two words of a scientific name, or None if empty."""
        words = name.split()  # split() also strips surrounding whitespace
        if len(words) >= 2:
            # Keep only genus + species
            return " ".join(words[:2])
        if words:  # Single-word name (e.g. genus only)
            return words[0]
        return None  # Empty or invalid

    # Filter out None so callers can safely call .lower() on every entry
    cleaned = (clean_name(insect.text) for insect in invasive_insects)
    return [name for name in cleaned if name is not None]

# Load the labelmaps; both CSVs are expected to contain at least a "name"
# column (and an "id" column, per reduced_insects_labelmap.csv).
original_labelmap = pd.read_csv(original_labelmap_path)
reduced_labelmap = pd.read_csv(reduced_labelmap_path)

# Step 3: Check and update reduced labelmap
def update_reduced_labelmap(original_labelmap, reduced_labelmap, invasive_names):
# Lowercase the scientific names for case-insensitive comparison
original_labelmap["name_lower"] = original_labelmap["name"].str.lower()
reduced_labelmap["name_lower"] = reduced_labelmap["name"].str.lower()

# Find matches between invasive names and the original labelmap
matches = original_labelmap[original_labelmap["name_lower"].isin([name.lower() for name in invasive_names])]

# Exclude names already in the reduced labelmap
new_entries = matches[~matches["name_lower"].isin(reduced_labelmap["name_lower"])]

# Append the new entries to the reduced labelmap and drop unnecessary columns
updated_labelmap = pd.concat([reduced_labelmap, new_entries.drop(columns=["name_lower"])], ignore_index=True)
return updated_labelmap

# Run the pipeline: scrape the invasive names, merge any new matches into the
# reduced labelmap, and overwrite the reduced CSV on disk.
invasive_insects = scrape_invasive_insects()
updated_labelmap = update_reduced_labelmap(original_labelmap, reduced_labelmap, invasive_insects)
# errors="ignore" tolerates the helper column already being absent
updated_labelmap.drop(columns=["name_lower"], errors="ignore").to_csv(reduced_labelmap_path, index=False)
48 changes: 48 additions & 0 deletions api/scripts/further_reduce_plants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
The reduced list of plants is still far too large to make API calls for every species.
This script identifies any plants in nature-id's reduced labelmap that fall under the category of the United States'
most invasive plants. For however many slots remain up to 200, it then randomly selects that number of native plants
from the reduced labelmap to produce a labelmap of 200 plant species.
"""
import requests
from bs4 import BeautifulSoup

# List of URLs to scrape - only two relevant pages on the website
# (the listing is paginated via the "page" query parameter)
urls = [
    "https://www.invasivespeciesinfo.gov/terrestrial/plants?page=0",
    "https://www.invasivespeciesinfo.gov/terrestrial/plants?page=1"
]

# Function to extract scientific names from a single page
def extract_scientific_names(url):
    """Fetch one listing page and return the scientific names it contains.

    The names live in <em> tags nested inside the species-profile card
    markup. Raises requests.HTTPError if the page cannot be retrieved.
    """
    page = requests.get(url)
    page.raise_for_status()  # Ensure the request was successful
    parsed = BeautifulSoup(page.text, "html.parser")

    # Each species card carries its scientific name in an <em> element
    selector = "div.usa-card__body div.field--name-species-profile-scientific-name em"
    names = []
    for tag in parsed.select(selector):
        names.append(tag.text.strip())
    return names

# List to store all scientific names
all_scientific_names = []

# Scrape each hardcoded URL and collect the names in page order
for url in urls:
    names = extract_scientific_names(url)
    all_scientific_names.extend(names)

# Output the results
print("Extracted Scientific Names:")
for name in all_scientific_names:
    print(name)

# Save the names to a file. The encoding is pinned to utf-8 so botanical
# names containing non-ASCII characters (hybrid signs, diacritics) cannot
# raise UnicodeEncodeError under a legacy locale encoding (e.g. cp1252).
output_file = "invasive_plant_species.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for name in all_scientific_names:
        f.write(name + "\n")

print(f"Scientific names saved to {output_file}")
60 changes: 60 additions & 0 deletions api/scripts/invasive_plant_species.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
Dioscorea bulbifera
Elaeagnus umbellata Thunb.
Vitex rotundifolia L. f.
Schinus terebinthifolius
Inula britannica
Cirsium arvense
Ligustrum sinense
Triadica sebifera (L.) Small
Imperata cylindrica
Rhamnus cathartica
Dipsacus fullonum
Linaria dalmatica
Centaurea diffusa
Bromus tectorum
Hedera
Ficaria verna
Ranunculus ficaria
Alliaria petiolata
Heracleum mantegazzianum
Phyllostachys aurea
Lepidium appelianum
Cynoglossum officinale
Berberis thunbergii
Lygodium japonicum
Lonicera japonica
Fallopia japonica
Spiraea japonica
Microstegium vimineum
Sorghum halepense
Pueraria montana
lobata
Euphorbia esula
Taeniatherum caput-medusae
Persicaria perfoliata
Polygonum perfoliatum
Rosa multiflora
Carduus nutans
Lygodium microphyllum
Celastrus orbiculatus
Amaranthus palmeri
Paulownia tomentosa
Centaurea calcitrapa
Elymus repens
repens
Rhaponticum repens
Elaeagnus angustifolia
Nandina domestica
Tamarix
Cytisus scoparius
Onopordum acanthium
Centaurea stoebe
micranthos
Hypericum perforatum
Ailanthus altissima
Solanum viarum
Lepidium draba
Cardaria draba
Striga asiatica
Centaurea solstitialis
Linaria vulgaris
1 change: 1 addition & 0 deletions api/scripts/reduced_insects_labelmap.csv
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,4 @@ id,name
1009,Malacosoma californicum
1011,Epargyreus clarus
1013,Polistes dominula
607,Bagrada hilaris

0 comments on commit c2c788f

Please sign in to comment.