-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
script to extract list of invasive plant species; script to extract invasive insect species and append those in the U.S. to reduced_insects_labelmap
- Loading branch information
Showing
4 changed files
with
164 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import pandas as pd | ||
|
||
# Absolute paths to the labelmaps:
#  - source: the full AIY insects labelmap shipped with the classifier
#  - target: the reduced labelmap that matched invasive species are appended to
original_labelmap_path = "C:\\SDD\\PolliNation\\api\\classifiers\\aiy_insects_V1_labelmap.csv"
reduced_labelmap_path = "C:\\SDD\\PolliNation\\api\\scripts\\reduced_insects_labelmap.csv"
|
||
# Scrape invasive insect list from website
def scrape_invasive_insects():
    """Scrape invasive insect scientific names from the UCR CISR site.

    Returns:
        list[str]: cleaned names ("Genus species", or a single word when
        only one is present). Entries that clean down to nothing are
        dropped so callers can safely call .lower() on every element.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    url = "https://cisr.ucr.edu/invasive-species/scientific-names"
    # timeout keeps the script from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Ensure the request was successful
    soup = BeautifulSoup(response.text, "html.parser")

    # Each scientific name is stored in an <em> tag inside an <li> element
    invasive_insects = soup.select("ul li em")

    def clean_name(name):
        """Keep only the text before the first comma, reduced to at most
        the first two words (genus + species); None when nothing remains."""
        # Fix: the original never split on the comma it documented, so
        # common-name suffixes like "Genus species," survived cleaning.
        words = name.split(",")[0].split()
        if len(words) >= 2:
            return " ".join(words[:2])
        if words:  # Case where only one word
            return words[0]
        return None  # Empty or invalid

    cleaned = (clean_name(insect.text) for insect in invasive_insects)
    # Filter out None so downstream case-insensitive comparisons cannot crash.
    return [name for name in cleaned if name is not None]
|
||
# Load the labelmaps
# NOTE: module-level I/O — both CSVs are read eagerly when the script runs.
# Each is expected to contain at least a "name" column (used below);
# presumably also "id" per the labelmap format — TODO confirm.
original_labelmap = pd.read_csv(original_labelmap_path)
reduced_labelmap = pd.read_csv(reduced_labelmap_path)
|
||
# Step 3: Check and update reduced labelmap
def update_reduced_labelmap(original_labelmap, reduced_labelmap, invasive_names):
    """Return reduced_labelmap plus every row of original_labelmap whose
    "name" matches an entry of invasive_names (case-insensitive) and is not
    already present in reduced_labelmap.

    Unlike the original implementation, neither input DataFrame is mutated
    and the returned frame carries no "name_lower" helper column.

    Args:
        original_labelmap: DataFrame with a "name" column (full labelmap).
        reduced_labelmap: DataFrame with a "name" column (subset labelmap).
        invasive_names: iterable of scientific names to look for.

    Returns:
        A new DataFrame: reduced_labelmap followed by the matching new rows.
    """
    # Case-insensitive comparison keys, kept out of the input frames.
    original_lower = original_labelmap["name"].str.lower()
    reduced_lower = set(reduced_labelmap["name"].str.lower())
    invasive_lower = {name.lower() for name in invasive_names}

    # Rows naming an invasive species that the reduced labelmap lacks.
    is_invasive = original_lower.isin(invasive_lower)
    is_new = ~original_lower.isin(reduced_lower)
    new_entries = original_labelmap[is_invasive & is_new]

    return pd.concat([reduced_labelmap, new_entries], ignore_index=True)
|
||
# Scrape the invasive list, merge it into the reduced labelmap, and persist
# the result (minus any helper column) back to the reduced labelmap CSV.
invasive_insects = scrape_invasive_insects()
updated_labelmap = update_reduced_labelmap(
    original_labelmap, reduced_labelmap, invasive_insects
)
final_labelmap = updated_labelmap.drop(columns=["name_lower"], errors="ignore")
final_labelmap.to_csv(reduced_labelmap_path, index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
""" | ||
The reduced list of plants is still far too large to successfully make API calls with for every species. | ||
This script is to identify any plants in nature-id's reduced labelmap that fall under the category of the United State's most invasive | ||
plants. For whatever number remains until 200, it will then randomly select that amount of native plants from the reduced labelmap to produce a | ||
labelmap of 200 plant species. | ||
""" | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
# Only two pages on the site list terrestrial plants, so both are
# enumerated here rather than discovered by pagination.
urls = [
    f"https://www.invasivespeciesinfo.gov/terrestrial/plants?page={page}"
    for page in range(2)
]
|
||
# Function to extract scientific names from a single page
def extract_scientific_names(url):
    """Return the scientific names listed on one invasivespeciesinfo.gov page.

    Args:
        url: listing-page URL to scrape.

    Returns:
        list[str]: stripped text of each species' scientific-name <em> tag.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    # timeout keeps the script from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Ensure the request was successful
    soup = BeautifulSoup(response.text, "html.parser")

    # Scientific names are <em> tags inside each species card's
    # "scientific name" field.
    plant_items = soup.select(
        "div.usa-card__body div.field--name-species-profile-scientific-name em"
    )

    return [item.text.strip() for item in plant_items]
|
||
# Collect scientific names from every hardcoded listing page.
all_scientific_names = []
for url in urls:
    all_scientific_names.extend(extract_scientific_names(url))

# Echo the results so a run can be inspected on the console.
print("Extracted Scientific Names:")
for name in all_scientific_names:
    print(name)

# Save the names, one per line. Explicit UTF-8 avoids platform-default
# encoding errors on non-ASCII characters (e.g. hybrid markers) that the
# original bare open(..., "w") was exposed to.
output_file = "invasive_plant_species.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(name + "\n" for name in all_scientific_names)

print(f"Scientific names saved to {output_file}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
Dioscorea bulbifera | ||
Elaeagnus umbellata Thunb. | ||
Vitex rotundifolia L. f. | ||
Schinus terebinthifolius | ||
Inula britannica | ||
Cirsium arvense | ||
Ligustrum sinense | ||
Triadica sebifera (L.) Small | ||
Imperata cylindrica | ||
Rhamnus cathartica | ||
Dipsacus fullonum | ||
Linaria dalmatica | ||
Centaurea diffusa | ||
Bromus tectorum | ||
Hedera | ||
Ficaria verna | ||
Ranunculus ficaria | ||
Alliaria petiolata | ||
Heracleum mantegazzianum | ||
Phyllostachys aurea | ||
Lepidium appelianum | ||
Cynoglossum officinale | ||
Berberis thunbergii | ||
Lygodium japonicum | ||
Lonicera japonica | ||
Fallopia japonica | ||
Spiraea japonica | ||
Microstegium vimineum | ||
Sorghum halepense | ||
Pueraria montana var. lobata
Euphorbia esula | ||
Taeniatherum caput-medusae | ||
Persicaria perfoliata | ||
Polygonum perfoliatum | ||
Rosa multiflora | ||
Carduus nutans | ||
Lygodium microphyllum | ||
Celastrus orbiculatus | ||
Amaranthus palmeri | ||
Paulownia tomentosa | ||
Centaurea calcitrapa | ||
Elymus repens | ||
repens | ||
Rhaponticum repens | ||
Elaeagnus angustifolia | ||
Nandina domestica | ||
Tamarix | ||
Cytisus scoparius | ||
Onopordum acanthium | ||
Centaurea stoebe subsp. micranthos
Hypericum perforatum | ||
Ailanthus altissima | ||
Solanum viarum | ||
Lepidium draba | ||
Cardaria draba | ||
Striga asiatica | ||
Centaurea solstitialis | ||
Linaria vulgaris |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,3 +167,4 @@ id,name | |
1009,Malacosoma californicum | ||
1011,Epargyreus clarus | ||
1013,Polistes dominula | ||
607,Bagrada hilaris |