Skip to content

Commit

Permalink
added vsearch
Browse files Browse the repository at this point in the history
  • Loading branch information
pieterprovoost committed Nov 6, 2024
1 parent b115ebe commit 3d3f8f0
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 84 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@ node_modules
npm-debug.log*
yarn-debug.log*
yarn-error.log*

data/sequences.udb
output.txt
vsearch-*
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,21 @@ Build the fasta file, occurrence sqlite database, and bowtie2 database:

```sh
./build_db.sh
```

### Generate vsearch database

```bash
./vsearch-2.29.1-macos-aarch64/bin/vsearch --makeudb_usearch data/sequences.fasta --output data/sequences.udb
```

```bash
./vsearch-2.29.1-macos-aarch64/bin/vsearch --usearch_global perna.fasta --db data/sequences.udb --id 0.95 --query_cov 0.95 --maxaccepts 100 --maxrejects 100 --maxhits 10 --blast6out output.txt
```

### Upload data

```sh
rsync -r data ***@***:/data/sequence-search/data
```

Expand Down
9 changes: 9 additions & 0 deletions api/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
from fastapi import FastAPI
from search import search_sequences
import pandas as pd
from fastapi.middleware.cors import CORSMiddleware


app = FastAPI()

app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

default_sequence = """
TAGTCATATGCTTGTCTCAAAGATAAGCCATGCATGTCTAAGTATAAGCGACTATACTGTGAAACTGCGA
ATGGCTCATTAAATCAGTTATGGTTTATTTGATGGTACCTTGCTACTTGGATAACCGTAGTAATTCTAGA
Expand Down
72 changes: 51 additions & 21 deletions api/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@
logging.basicConfig(level=logging.INFO)


def search_sequences(seq):
def search_sequences(seq, strategy="vsearch"):

with tempfile.TemporaryDirectory() as tmpdir:

logging.info(f"Created temp dir {tmpdir}")

input_file_path = os.path.join(tmpdir, "input.fasta")
sam_file_path = os.path.join(tmpdir, "output.sam")

# generate input file

Expand All @@ -31,33 +30,64 @@ def search_sequences(seq):
)
SeqIO.write([record], fasta_file, "fasta")

# run bowtie2
records = []

os.system(f"bowtie2 -x ../data/db/sequences -U {input_file_path} -S {sam_file_path} -f --local --very-sensitive --no-unal -k 100")
if strategy == "bowtie2":

# sam results
sam_file_path = os.path.join(tmpdir, "output.sam")

records = []
bowtie2_path = "/Users/pieter/IPOfI Dropbox/Pieter Provoost/werk/projects/OBIS pipeline/sequence-search/bowtie2-2.5.0-macos-arm64"
os.environ["PATH"] += os.pathsep + bowtie2_path
os.system(f"bowtie2 -x ../data/db/sequences -U {input_file_path} -S {sam_file_path} -f --local --very-sensitive --no-unal -k 100")

con = sqlite3.connect(sqlite_file_path)
con.row_factory = sqlite3.Row
cur = con.cursor()

samfile = pysam.AlignmentFile(sam_file_path)
for read in samfile.fetch():

res = cur.execute("select * from occurrence where hash = ? limit 100", (read.reference_name,))
rows = res.fetchall()

for row in rows:
record = dict(zip(row.keys(), row))
record["as"] = read.get_tag("AS")
records.append(record)

if len(records) > 500:
break

con.close()
samfile.close()

elif strategy == "vsearch":

blast_file_path = os.path.join(tmpdir, "output.blast")

con = sqlite3.connect(sqlite_file_path)
con.row_factory = sqlite3.Row
cur = con.cursor()
vsearch_path = "/Users/pieter/IPOfI Dropbox/Pieter Provoost/werk/projects/OBIS pipeline/sequence-search/vsearch-2.29.1-macos-aarch64/bin"
os.environ["PATH"] += os.pathsep + vsearch_path
os.system(f"vsearch --usearch_global {input_file_path} --db ../data/sequences.udb --id 0.9 --query_cov 0.9 --maxaccepts 100 --maxrejects 100 --maxhits 100 --blast6out {blast_file_path}")

samfile = pysam.AlignmentFile(sam_file_path)
for read in samfile.fetch():
con = sqlite3.connect(sqlite_file_path)
con.row_factory = sqlite3.Row
cur = con.cursor()

res = cur.execute("select * from occurrence where hash = ? limit 100", (read.reference_name,))
rows = res.fetchall()
with open(blast_file_path, "r") as f:
for line in f:
fields = line.strip().split("\t")
reference = fields[1]
identity = float(fields[2])

for row in rows:
record = dict(zip(row.keys(), row))
record["as"] = read.get_tag("AS")
records.append(record)
res = cur.execute("select * from occurrence where hash = ? limit 100", (reference,))
rows = res.fetchall()

if len(records) > 500:
break
for row in rows:
record = dict(zip(row.keys(), row))
record["as"] = identity
records.append(record)

con.close()
samfile.close()
if len(records) > 500:
break

return records
74 changes: 35 additions & 39 deletions app/src/App.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,51 +19,46 @@ L.Icon.Default.mergeOptions({
shadowUrl: iconShadow,
});

const examples = [
`CCGCGGTAAGACGGAGGATGCAAGTGTTATCCGGAATCACTGGGCGTAAAGCGTCTGTAGGTGGTTTAATAAGTCAACTGTTAAATCTTGAGGCTCAACTTCAAAATCGCAGTCGAAACTATTAGACTAGAGTATAGTAGAGGTAAAGGGAATTTCCAGTGGAGCGGTGAAATGCGTAGATATTGGAAAGAACACCGATGGCGAAAGCACTTTACTGGGCTATTACTAACACTCAGAGACGAAAGCTAGGGTAG`,
`AGCCGCGGTAATTCCAGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATTTCGGAGCGGGCCAGTTGGTCGGCCGCAAGGTCTGTTACTGACTGGTCTGCTCTTCTTCGCAAAGACTGCGTGTGCTCTTGACTGAGTGTGCGTAGGATTTACGACGTTTACTTTGAAAAAATTAGAGTGTTCAAAGCAGGC`,
`TGGTGGAGTGATTTGTCTGGTTAATTCCGTTAACGAACGAGACCTTAACCTGCTAAATAGTTACATGTAATTCCGGTTATGTGGGCAACTTCTTAGAGGGACTTTGTGTGTATAACGCAAGGAAGTTTGAGGCAATAACA`,
`AGGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGGCTAAAATGCCTGAATATTCGTGCATGGAATAATAGAATAGGATGTCGATCCTATTTTGTTGGTTTTCGGGACTCGACATAATGATAAATAGGGACAGTCGGGGGCATTTGTATTCAAACGACAGAGGTGAAATTCTTGGACCGTTTGAAGACAAACTACTGCGAAAGCATTTG`
]

function App() {

const [occurrences, setOccurrences] = useState([]);
const [table, setTable] = useState([]);
const [loading, setLoading] = useState(false);
const [slider, setSlider] = useState(100);
const [maxSlider, ] = useState(1000);
const [sequence, setSequence] = useState(`TAGTCATATGCTTGTCTCAAAGATAAGCCATGCATGTCTAAGTATAAGCGACTATACTGTGAAACTGCGA
ATGGCTCATTAAATCAGTTATGGTTTATTTGATGGTACCTTGCTACTTGGATAACCGTAGTAATTCTAGA
GCTAATACATGCAGGAGTTCCCGACTCACGGAGGGATGTATTTATTAGATAAGAAACCAAACCGGTCTCC
GGTTGCGTGCTGAGTCATAATAACTGCTCGAATCGCACGGCTCTACGCCGGCGATGGTTCATTCAAATTT
CTGCCCTATCAGCTTTCGATGGTAGGATAGAGGCCTACCATGGCGTTAACGGGTAACGGAGAATTAGGGT
TCGATTCCGGAGAGGGAGCCTGAGAAATGGCTACCACATCCAAGGAAGGCAGCAGGCGCGTAAATTGCCC
GAATCCTGACACAGGGAGGTAGTGACAAGAAATAACAATACAGGGCTATTTTAGTCTTGTAATTGGAATG
AGTACAATTTACATCTCTTCACGAGGATCAATTGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCC
AGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAAACGCTCGTAGTCGGATTTCGGGGCGGGCCGACC
GGTCTGCCGATGGGTATGCACTGGCCGGCGCGTCCTTCCACCCGGAGACCGCGCCTACTCTTAACTGAGC
GGGCGCGGGAGACGGGTCTTTTACTTTGAAAAAATCAGAGTGTTTCAAGCAGGCAGTCGCTCTTGCATGG
ATTAGCATGGGATAATGAAATAGGACTCTGGTGCTATTTTGTTGGTTTCGAACACCGGAGTAATGATTAA
CAGGGACAGTCAGGGGCACTCGTATTCCGCCGAGAGAGGTGAAATTCTCAGACCAGCGGAAGACGAACCA
CTGCGAAAGCATTTGCCAGGGATGTTTTCACTGATCAAGAACGAAAGTTAGGGGATCGAAGACGATCAGA
TACCGTCGTAGTCTTAACCATAAACCATGCCGACTAGGGATTGGAGGATGTTCCATTTGTGACTCCTTCA
GCACCTTTCGGGAAACTAAAGTCTTTGGGTTCCGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAGGAA
TTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGGAAACTTACCAGG
TCCAGCACATTGTGAGGATTGACAGATTGAGAGCTCTTTCTTGATTCGATGGGTGGTGGTGCATGGCCGT
TCTTAGTTGGTGGAGTGATTTGTCTGGTTAATTCCGTTAACGAACGAGACCGCAGCCTGCTAAATAGCGA
CGCGAACCCTCCGTTCGCTGGAGCTTCTTAGAGGGACAACTTGTCTTCAACAAGTGGAAGTTCGCGGCAA
TAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGATGCACTCAACGAGTCTA
TCACCTTGACCGAGAGGTCCGGGTAATCTTTTGAAATTGCATCGTGATGGGGATAGATTATTGCAACTAT
TAATCTTCAACGAGGAATTCCTAGTAAGCGTGTGTCATCAGCGCACGTTGATTACGTCCCTGCCCTTTGT
ACACACCGCCCGTCGCTCCTACCGATTGAATGATCCGGTGAGGCCCCCGGACTGCGGCGCCGCAGCTGGT
TCTCCAGCCGCGACGCCGCGGGAAGCTGTCCGAACCTTATCATTTAGAGGAAGGAGAAGTCGTAACAAGG`);

function handleSequenceChange(e) {
setSequence(e.target.value);
}
const [slider, setSlider] = useState(95);
const [maxSlider, ] = useState(100);
const [exampleIndex, setExampleIndex] = useState(0);
const [sequence, setSequence] = useState(examples[0]);

function handleSliderChange(value) {
function handleSequenceChange(e) {
setSequence(e.target.value);
}

function handleLoadExample(e) {
if (exampleIndex === examples.length - 1) {
setExampleIndex(0);
setSequence(examples[0]);
} else {
setExampleIndex(exampleIndex + 1);
setSequence(examples[exampleIndex + 1]);
}
}

function handleSliderChange(value) {
setSlider(value);
}

function handleSearch() {
setLoading(true);
async function search() {
const res = await fetch("https://api.sequence.obis.org/search?sequence=" + sequence.trim());
// const res = await fetch("https://api.sequence.obis.org/search?sequence=" + sequence.trim());
const res = await fetch("http://localhost:8000/search?sequence=" + sequence.trim());
const data = await res.json();
setOccurrences(data["results"]);
setTable(data["table"]);
Expand Down Expand Up @@ -122,8 +117,8 @@ function handleSearch() {

{ slider !== null &&
<Control prepend position="bottomleft">
<span>Minimum alignment score: { slider }</span>
<ReactSlider className="slider" thumbClassName="thumb" trackClassName="track" value={slider} max={maxSlider} onAfterChange={(value, index) =>
<span>Identity threshold: { slider }%</span>
<ReactSlider className="slider" thumbClassName="thumb" trackClassName="track" value={slider} min={80} max={maxSlider} onAfterChange={(value, index) =>
handleSliderChange(value)
} />
</Control>
Expand All @@ -143,10 +138,11 @@ function handleSearch() {
<Spinner className="mx-2" animation="border" size="sm"></Spinner>
}
</Button>
<Button className="ms-2" variant="light" onClick={handleLoadExample}>Load example</Button>
</Col>
<Col sm={4}>
<h5>About</h5>
<p>This application aligns sequences against all sequence records in the OBIS database using <a href="https://github.com/BenLangmead/bowtie2" target="_blank" rel="noreferrer">bowtie2</a>. Up to 100 distinct sequences as well as the associated occurrence records are returned ordered by alignment score.</p>
<p>This application aligns sequences against all sequence records in the OBIS database using <a href="https://github.com/torognes/vsearch" target="_blank" rel="noreferrer">VSEARCH</a>. Up to 100 distinct sequences as well as the associated occurrence records are returned ordered by sequence similarity. In the table below, occurrences are grouped by sequence, taxonomy, and dataset.</p>
</Col>
</Row>
<Row>
Expand All @@ -157,7 +153,7 @@ function handleSearch() {
<thead>
<tr>
<th>scientificName</th>
<th>alignment score</th>
<th>identity</th>
<th>phylum</th>
<th>class</th>
<th>order</th>
Expand All @@ -169,13 +165,13 @@ function handleSearch() {
<tbody>
{ table.map((occ, i) => <tr key={i}>
<td>{occ.scientificname}</td>
<td>{occ.as} <span className="bar" style={{width: occ.as / 10}}></span></td>
<td>{occ.as} <span className="bar" style={{width: Math.max(0, (occ.as - 95) * 10)}}></span></td>
<td>{occ.phylum}</td>
<td>{occ.class}</td>
<td>{occ.order}</td>
<td>{occ.family}</td>
<td>{occ.genus}</td>
<td><a href={"https://obis.org/dataset/" + occ.dataset_id} rel="noreferrer" target="_blank">link</a></td>
<td><a href={"https://obis.org/dataset/" + occ.dataset_id} rel="noreferrer" target="_blank">{occ.dataset_id}</a></td>
</tr>) }
</tbody>
</Table> :
Expand Down
47 changes: 24 additions & 23 deletions generate_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import progressbar
# import progressbar
import sqlite3


Expand All @@ -22,20 +22,21 @@
except FileNotFoundError:
pass

with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
with open(fasta_file_path, "w") as fasta_file:
with open(sequence_file_path) as csv_file:
reader = csv.DictReader(csv_file)
count = 0
for row in reader:
record = SeqRecord(
Seq(row["sequence"]),
id=row["hash"],
description=row["hash"]
)
SeqIO.write([record], fasta_file, "fasta")
count = count + 1
bar.update(count)
# bar = progressbar.ProgressBar()
with open(fasta_file_path, "w") as fasta_file:
with open(sequence_file_path) as csv_file:
reader = csv.DictReader(csv_file)
count = 0
for row in reader:
seq = str(row["sequence"]).replace("-", "")
record = SeqRecord(
Seq(seq),
id=row["hash"],
description=row["hash"]
)
SeqIO.write([record], fasta_file, "fasta")
count = count + 1
# bar.update(count)

# sqlite (occurrence IDs)

Expand All @@ -47,14 +48,14 @@
con = sqlite3.connect(sqlite_file_path)
cur = con.cursor()
cur.execute("create table occurrence (hash, decimallongitude real, decimallatitude real, dataset_id, phylum, class, \"order\", family, genus, scientificname, count int)")
with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
with open(occurrence_file_path) as csv_file:
reader = csv.DictReader(csv_file)
count = 0
for row in reader:
cur.execute("insert into occurrence values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", list(row.values()))
count = count + 1
bar.update(count)
# with progressbar.ProgressBar(max_value=progressbar.UnknownLength) as bar:
with open(occurrence_file_path) as csv_file:
reader = csv.DictReader(csv_file)
count = 0
for row in reader:
cur.execute("insert into occurrence values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", list(row.values()))
count = count + 1
# bar.update(count)

cur.execute("create index idx_hash on occurrence (hash)")
con.commit()
Expand Down

0 comments on commit 3d3f8f0

Please sign in to comment.