Skip to content

Commit

Permalink
Add scrape sitting dates
Browse files Browse the repository at this point in the history
  • Loading branch information
limdingwen committed Jul 30, 2024
1 parent d798855 commit a171595
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 0 deletions.
11 changes: 11 additions & 0 deletions scripts/deploy-and-run/scrape-sitting-dates.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Deploys the "scrape-sitting-dates" function and then runs it once with an
# empty JSON object ("{}") as the payload.
#
# The `source` paths below are relative, so they are resolved against the
# current working directory of the caller, not against this file's location.

# Abort immediately if any command fails.
set -e

# Load the environment file and the `deploy` / `run` shell helpers
# (presumably defined in the sourced files — verify).
source ../.env
source ../lib/functions/deploy.sh
source ../lib/functions/run.sh

FUNCTION_NAME="scrape-sitting-dates"

# SUPABASE_PROJECT_ID and SUPABASE_FUNCTIONS_KEY are not assigned in this
# script; presumably they come from ../.env — confirm.
deploy "$SUPABASE_PROJECT_ID" "$FUNCTION_NAME"
run "$SUPABASE_PROJECT_ID" "$SUPABASE_FUNCTIONS_KEY" "$FUNCTION_NAME" "{}"
85 changes: 85 additions & 0 deletions supabase/functions/lib/shared-functions/scrape-sitting-dates.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import { createSupabase } from "../utils/create-supabase.ts";
import { isAdmin } from "../utils/check-admin.ts";
import {
DOMParser,
Element,
HTMLDocument,
} from "https://deno.land/x/[email protected]/deno-dom-wasm.ts";
import buildResponseProxy from "../utils/build-response-proxy.ts";
import moment from "https://deno.land/x/[email protected]/mod.ts";
import { SupabaseClient } from "https://esm.sh/v135/@supabase/[email protected]/dist/module/index.d.ts";

/**
 * Downloads the Votes and Proceedings listing page from the Parliament
 * website.
 *
 * The page is requested with a POST whose form body is `PageSize=40`.
 *
 * @returns The response body as text.
 * @throws {Error} If the server answers with a non-2xx status, so that an
 *   error page is never handed to the HTML parser as if it were the listing.
 */
async function getSittingDatesHtml(): Promise<string> {
  const url =
    "https://www.parliament.gov.sg/parliamentary-business/votes-and-proceedings";
  const pageSize = "40";

  const response = await fetch(url, {
    method: "POST",
    body: new URLSearchParams({ PageSize: pageSize }),
  });

  if (!response.ok) {
    // Discard the unread body so the connection is released before failing.
    await response.body?.cancel();
    throw new Error(
      `Failed to fetch sitting dates page: ${response.status} ${response.statusText}`,
    );
  }

  return await response.text();
}

/**
 * Parses the downloaded page into a DOM document.
 *
 * @param response Raw HTML text of the page.
 * @returns Whatever `DOMParser.parseFromString` yields for `"text/html"`.
 */
function parseSittingDates(response: string) {
  const parser = new DOMParser();
  const parsedDocument = parser.parseFromString(response, "text/html");
  return parsedDocument;
}

/**
 * Reads the text of the `.xs-boxgap` element inside one sitting container.
 *
 * @param container Element expected to contain a `.xs-boxgap` descendant.
 * @returns The `textContent` of that descendant, untrimmed.
 * @throws {Error} If the container has no `.xs-boxgap` descendant. This is
 *   reported explicitly rather than surfacing as a null-dereference TypeError.
 */
function scrapeRawStringFromContainer(container: Element) {
  const label = container.querySelector(".xs-boxgap");
  if (label == null) {
    throw new Error(
      'Sitting container has no ".xs-boxgap" element; the page layout may have changed.',
    );
  }
  return label.textContent;
}

/**
 * Extracts the date from a string of the form `Sitting on 5 August 2024` and
 * converts it to an ISO 8601 string.
 *
 * @param rawString Text scraped from the page; surrounding whitespace is
 *   ignored.
 * @returns The result of `toISOString()` on the parsed date.
 * @throws {Error} If the text contains no `Sitting on <day> <month> <year>`
 *   phrase, or if the extracted date cannot be parsed.
 */
function parseRawStringToIsoDate(rawString: string): string {
  const trimmed = rawString.trim();

  // No `g` flag: a global match() returns whole matches only, which would
  // make the capture group below unreachable.
  const match = trimmed.match(/Sitting on (\d{1,2} \w+ \d{4})/);
  if (match === null) {
    throw new Error(`Could not find a sitting date in: "${trimmed}"`);
  }

  // Group 1 is just the "<day> <month> <year>" part, without "Sitting on".
  const rawDateString = match[1];

  // NOTE(review): this presumably parses in the runtime's local timezone while
  // toISOString() emits UTC, so the value may shift outside UTC — confirm.
  const date = moment(rawDateString, "D MMM YYYY");
  if (!date.isValid()) {
    throw new Error(`Could not parse "${rawDateString}" as a valid date.`);
  }
  return date.toISOString();
}

/**
 * Collects one `{ sitting_date }` record per `.indv-votes` element in the
 * document, in document order.
 *
 * @param doc Parsed listing page.
 * @returns Records whose `sitting_date` is the ISO string derived from each
 *   container's text.
 */
function scrapeFromParsedHtml(doc: HTMLDocument) {
  const sittingDates: { sitting_date: string }[] = [];
  const containers = doc.querySelectorAll(".indv-votes") as Iterable<Element>;

  for (const container of containers) {
    const isoDate = parseRawStringToIsoDate(
      scrapeRawStringFromContainer(container),
    );
    sittingDates.push({ sitting_date: isoDate });
  }

  return sittingDates;
}

/**
 * Upserts the given records into the `sitting_date` table, using
 * `sitting_date` as the conflict target and ignoring duplicates.
 *
 * @param supabase Client used to perform the upsert.
 * @param sittingDates Records to write.
 * @throws Rethrows the `error` reported by the upsert, if any.
 */
async function uploadSittingDates(
  supabase: SupabaseClient,
  sittingDates: { sitting_date: string }[],
) {
  const upsertOptions = {
    onConflict: "sitting_date",
    ignoreDuplicates: true,
  };
  const result = await supabase
    .from("sitting_date")
    .upsert(sittingDates, upsertOptions);

  if (result.error) {
    throw result.error;
  }
}

/**
 * Scrapes parliamentary sitting dates from the Votes and Proceedings listing
 * page and upserts them into the `sitting_date` table, ignoring duplicates.
 *
 * @param req Incoming request; it must pass the `isAdmin` check.
 * @returns A response proxy carrying a 401 "Unauthorised." message when the
 *   admin check fails, otherwise a message listing the scraped dates.
 * @throws If fetching, parsing, scraping or uploading fails.
 */
export default async function scrapeSittingDates(req: Request) {
  // Reject unauthorised callers before doing any other work.
  if (!isAdmin(req)) {
    return buildResponseProxy({ message: "Unauthorised." }, 401);
  }

  const supabase = createSupabase();

  console.log("Getting sitting dates HTML from URL...");
  const response = await getSittingDatesHtml();

  console.log("Parsing the downloaded HTML...");
  // Log only the size; dumping the whole page floods the logs on every run.
  console.log(`Downloaded ${response.length} characters of HTML.`);
  const doc = parseSittingDates(response);
  // NOTE(review): the DOM parser presumably can return null for unparsable
  // input in this library version — confirm. Fail clearly instead of crashing
  // later inside the scraper.
  if (doc == null) {
    throw new Error("Failed to parse the sitting dates HTML.");
  }

  console.log("Scraping and uploading relevant data...");
  const sittingDates = scrapeFromParsedHtml(doc);
  await uploadSittingDates(supabase, sittingDates);

  return buildResponseProxy({
    message: `Scraped and uploaded the following sitting dates, ignoring any duplication: ${JSON.stringify(sittingDates)}`,
  });
}
2 changes: 2 additions & 0 deletions supabase/functions/per-1-hour/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ import { buildResponse } from "../lib/utils/build-response.ts";
import scrapeBillsIntroduced from "../lib/shared-functions/scrape-bills-introduced.ts";
import fetchMpData from "../lib/shared-functions/fetch-mp-data.ts";
import scrapeSittingReport from "../lib/shared-functions/scrape-sitting-report.ts";
import scrapeSittingDates from "../lib/shared-functions/scrape-sitting-dates.ts";

Deno.serve(async (req) => {
return buildResponse([
await fetchMpData(req),
await scrapeSittingDates(req),
await scrapeSittingReport(req),
await scrapeBillsIntroduced(req),
]);
Expand Down
5 changes: 5 additions & 0 deletions supabase/functions/scrape-sitting-dates/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import "https://esm.sh/@supabase/functions-js/src/edge-runtime.d.ts";
import proxyToResponseWrapper from "../lib/utils/proxy-to-response-wrapper.ts";
import scrapeSittingDates from "../lib/shared-functions/scrape-sitting-dates.ts";

// Entry point of the scrape-sitting-dates function: the handler passed to
// Deno.serve is scrapeSittingDates wrapped by proxyToResponseWrapper.
Deno.serve(proxyToResponseWrapper(scrapeSittingDates));

0 comments on commit a171595

Please sign in to comment.