Skip to content

Commit

Permalink
Add test for the crawl meta parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
eliasdabbas committed Aug 17, 2024
1 parent 52577f3 commit 34b6d83
Showing 1 changed file with 25 additions and 1 deletion.
26 changes: 25 additions & 1 deletion tests/test_crawl.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import platform
from pathlib import Path
from tempfile import TemporaryDirectory
from tempfile import TemporaryDirectory, TemporaryFile

import pandas as pd
import pytest
Expand Down Expand Up @@ -172,3 +172,27 @@ def test_crawling_bad_url_directly_is_handled():
bad_url_df = pd.read_json(f"{broken_links_tempdir}/bad_url.jl", lines=True)
assert len(bad_url_df) == 1
assert bad_url_df["url"][0] == "https://example.com"


def test_meta_keys_correctly_populated():
    """Check that values passed through ``meta`` show up in the crawl output.

    The crawl is given a plain ``meta`` key (``"foo"``) and a
    ``"custom_headers"`` mapping for the crawled URL. The output file is then
    expected to contain a ``foo`` column holding the supplied value, and
    ``request_headers_*`` columns for the custom headers.
    """
    # The temporary directory is created inside the test on purpose. A
    # module-level ``with TemporaryDirectory()`` wrapped around the ``def``
    # exits (and deletes the directory) as soon as the function is *defined*,
    # i.e. before pytest ever calls it, so the test would write to a path that
    # no longer exists and nothing would clean it up afterwards.
    with TemporaryDirectory() as meta_parameter_dir:
        output_file = f"{meta_parameter_dir}/output.jsonl"
        crawl(
            url_list="https://example.com",
            output_file=output_file,
            meta={
                "foo": "bar",
                "custom_headers": {
                    "https://example.com": {
                        "If-None-Match": "XXXYYYZZZ",
                        "Blah": "blew",
                    }
                },
            },
        )
        # Load everything into memory before the directory is removed.
        crawl_df = pd.read_json(output_file, lines=True)

    assert "foo" in crawl_df
    assert "request_headers_If-None-Match" in crawl_df
    assert crawl_df["foo"][0] == "bar"
    assert crawl_df["request_headers_Blah"][0] == "blew"

0 comments on commit 34b6d83

Please sign in to comment.