-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
55346cf
commit 1f110a9
Showing
2 changed files
with
250 additions
and
0 deletions.
There are no files selected for viewing
245 changes: 245 additions & 0 deletions
245
supabase/functions/lib/shared-functions/scrape-sitting-report.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
import { createSupabase } from "../utils/create-supabase.ts"; | ||
import { isAdmin } from "../utils/check-admin.ts"; | ||
import { | ||
DOMParser, | ||
Element, | ||
} from "https://deno.land/x/[email protected]/deno-dom-wasm.ts"; | ||
import buildResponseProxy from "../utils/build-response-proxy.ts"; | ||
import { format } from "https://deno.land/[email protected]/datetime/mod.ts"; | ||
import { SupabaseClient } from "https://esm.sh/v135/@supabase/[email protected]/dist/module/index.d.ts"; | ||
|
||
async function insertSpeech( | ||
supabase: SupabaseClient, | ||
orderNo: number, | ||
debateId: number, | ||
content: string, | ||
) { | ||
const debateSpeechData = { | ||
order_no: orderNo, | ||
debate_id: debateId, | ||
content: content, | ||
}; | ||
const { error: insertDebateSpeechError } = await supabase | ||
.from("debate_speech") | ||
.insert(debateSpeechData); | ||
if (insertDebateSpeechError) throw insertDebateSpeechError; | ||
console.log(`Inserted debate speech ${JSON.stringify(debateSpeechData)}.`); | ||
} | ||
|
||
// Scrapes an entire sitting's report and fills out the database | ||
export default async function scrapeSittingReport(req: Request) { | ||
const supabase = createSupabase(); | ||
|
||
if (!isAdmin(req)) { | ||
return buildResponseProxy({ message: "Unauthorised." }, 401); | ||
} | ||
|
||
console.log("Getting sitting dates that need scraping..."); | ||
const { data: unscrapedDateData, error: unscrapedDateError } = await supabase | ||
.from("unscraped_sitting_dates_view") | ||
.select("id, sitting_date") | ||
.limit(1) | ||
.maybeSingle(); | ||
if (unscrapedDateError) throw unscrapedDateError; | ||
if (!unscrapedDateData) { | ||
return buildResponseProxy({ message: "No sitting dates need scraping." }); | ||
} | ||
console.log(`${unscrapedDateData.sitting_date} needs scraping.`); | ||
|
||
console.log("Getting sitting report JSON from URL..."); | ||
const sittingReportUrl = "https://sprs.parl.gov.sg/search/getHansardReport/"; | ||
const sittingReportFormattedDate = format( | ||
new Date(unscrapedDateData.sitting_date), | ||
"dd-MM-yyyy", | ||
); | ||
const sittingReportRequestParams = new URLSearchParams({ | ||
sittingDate: sittingReportFormattedDate, | ||
}); | ||
const sittingReportResponse = await fetch( | ||
sittingReportUrl + "?" + sittingReportRequestParams, | ||
); | ||
const sittingReport = await sittingReportResponse.json(); | ||
|
||
console.log("Scraping sitting report metadata..."); | ||
const parliamentNo = sittingReport.metadata.parlimentNO; | ||
const sessionNo = sittingReport.metadata.sessionNO; | ||
const volumeNo = sittingReport.metadata.volumeNO; | ||
const sittingNo = sittingReport.metadata.sittingNO; | ||
const sittingReportMetadata = { | ||
sitting_date_id: unscrapedDateData.id, | ||
parliament_no: parliamentNo, | ||
session_no: sessionNo, | ||
volume_no: volumeNo, | ||
sitting_no: sittingNo, | ||
}; | ||
const { data: sittingId, error: insertSittingMetadataError } = await supabase | ||
.from("sitting") | ||
.insert(sittingReportMetadata) | ||
.select("id") | ||
.single(); | ||
if (insertSittingMetadataError) throw insertSittingMetadataError; | ||
console.log( | ||
`Inserted sitting metadata ${JSON.stringify(sittingReportMetadata)} with ID ${sittingId.id}.`, | ||
); | ||
|
||
let debateOrderNo = 0; | ||
for (const sittingReportItem of sittingReport.takesSectionVOList) { | ||
console.log("Scraping debate..."); | ||
const debateTitle = sittingReportItem.title; | ||
// TODO: Add bill linkage | ||
const debateData = { | ||
order_no: debateOrderNo, | ||
sitting_id: sittingId.id, | ||
title: debateTitle, | ||
}; | ||
debateOrderNo++; | ||
|
||
const { data: debateId, error: insertDebateError } = await supabase | ||
.from("debate") | ||
.insert(debateData) | ||
.select("id") | ||
.single(); | ||
if (insertDebateError) throw insertDebateError; | ||
console.log( | ||
`Inserted debate ${JSON.stringify(debateData)} with ID ${debateId.id}.`, | ||
); | ||
|
||
console.log("Scraping debate speeches..."); | ||
const debateContentHtml = sittingReportItem.content; | ||
const debateContent = new DOMParser().parseFromString( | ||
debateContentHtml, | ||
"text/html", | ||
); | ||
const paragraphs = debateContent.querySelectorAll("p") as Iterable<Element>; | ||
|
||
// We build up the content buffer until a speaker's name is mentioned in the paragraph's <strong> tag | ||
// which we assume means that another speech has started. We then insert the content buffer into the database, which | ||
// crucially does not include the current paragraph (which is part of the next speech). | ||
// | ||
// We insert the content buffer one last time after the loop ends to ensure that the last speech is inserted. | ||
// Also, beware not to insert an empty contentBuffer (e.g. if the first paragraph has a speaker's name). | ||
let contentBuffer = ""; | ||
let debateSpeechOrderNo = 0; | ||
for (const paragraph of paragraphs) { | ||
const speaker = paragraph.querySelector("strong")?.textContent; | ||
const content = paragraph.textContent; | ||
if (speaker && contentBuffer.trim() != "") { | ||
console.log("Speaker name detected, inserting speech..."); | ||
await insertSpeech( | ||
supabase, | ||
debateSpeechOrderNo, | ||
debateId.id, | ||
contentBuffer, | ||
); | ||
debateSpeechOrderNo++; | ||
contentBuffer = ""; | ||
} | ||
|
||
// If there's nothing, then don't insert the paragraph break | ||
contentBuffer += (contentBuffer == "" ? "" : "\n\n") + content; | ||
} | ||
if (contentBuffer.trim() != "") { | ||
console.log("Inserting last speech..."); | ||
await insertSpeech( | ||
supabase, | ||
debateSpeechOrderNo, | ||
debateId.id, | ||
contentBuffer, | ||
); | ||
} | ||
} | ||
|
||
return buildResponseProxy({ message: "done" }); | ||
|
||
// | ||
// console.log("Parsing the downloaded HTML..."); | ||
// const billsIntroducedHtml = await billsIntroducedResponse.text(); | ||
// console.log(billsIntroducedHtml); | ||
// const billsIntroducedDoc = new DOMParser().parseFromString( | ||
// billsIntroducedHtml, | ||
// "text/html", | ||
// ); | ||
// | ||
// let addCount = 0; | ||
// let updateCount = 0; | ||
// | ||
// console.log("Scraping and uploading relevant data..."); | ||
// const billsIntroduced = billsIntroducedDoc.querySelectorAll( | ||
// ".indv-bill", | ||
// ) as Iterable<Element>; | ||
// for (const billIntroduced of billsIntroduced) { | ||
// const is_second_reading_next_available_seating = billIntroduced | ||
// .querySelector(".indv-bill .row:nth-of-type(2) div:nth-of-type(2)")! | ||
// .textContent.includes("Next Available Sitting"); | ||
// const passed_date: string | undefined = billIntroduced | ||
// .querySelector(".indv-bill .row:nth-of-type(2) div:nth-of-type(3)")! | ||
// .textContent.match(/(\d{2}\.\d{2}\.\d{4})/gm)?.[0]; | ||
// | ||
// const scrapedData = { | ||
// bill_no: billIntroduced | ||
// .querySelector(".indv-bill .bill-title div:nth-of-type(2)")! | ||
// .textContent.match(/(\d+\/\d{4})/gm)?.[0]!, | ||
// name: billIntroduced.querySelector("a")?.getAttribute("title")!, | ||
// date_introduced: toIsoDate( | ||
// billIntroduced | ||
// .querySelector(".indv-bill .row:nth-of-type(2) div:nth-of-type(1)")! | ||
// .textContent.match(/(\d{2}\.\d{2}\.\d{4})/gm)?.[0]!, | ||
// ), | ||
// second_reading_date_type: is_second_reading_next_available_seating | ||
// ? "next_available_seating" | ||
// : "explicit", | ||
// second_reading_date: is_second_reading_next_available_seating | ||
// ? null | ||
// : toIsoDate( | ||
// billIntroduced | ||
// .querySelector( | ||
// ".indv-bill .row:nth-of-type(2) div:nth-of-type(2)", | ||
// )! | ||
// .textContent.match(/(\d{2}\.\d{2}\.\d{4})/gm)?.[0]!, | ||
// ), | ||
// is_passed: passed_date !== undefined, | ||
// passed_date: passed_date ? toIsoDate(passed_date) : null, | ||
// pdf_url: sanitiseUrl( | ||
// billIntroduced.querySelector("a")!.getAttribute("href")!, | ||
// ), | ||
// original_text: null, | ||
// summary: null, | ||
// }; | ||
// | ||
// const billExists = | ||
// (await supabase | ||
// .from("bill") | ||
// .select("bill_no") | ||
// .eq("bill_no", scrapedData.bill_no) | ||
// .maybeSingle()) != null; | ||
// | ||
// if (billExists) { | ||
// // Make sure that we don't overwrite the original_text and summary values | ||
// const { | ||
// original_text: _original_text, | ||
// summary: _summary, | ||
// ...scrapedDataToUpdate | ||
// } = scrapedData; | ||
// const { error } = await supabase | ||
// .from("bill") | ||
// .update(scrapedDataToUpdate) | ||
// .eq("bill_no", scrapedData.bill_no); | ||
// if (error) throw error; | ||
// updateCount++; | ||
// console.info( | ||
// `Updated bill ${scrapedData.bill_no} with the following data: ${JSON.stringify(scrapedDataToUpdate)}`, | ||
// ); | ||
// } else { | ||
// const { error } = await supabase.from("bill").insert(scrapedData); | ||
// if (error) throw error; | ||
// addCount++; | ||
// console.info( | ||
// `Added bill ${scrapedData.bill_no} with the following data: ${JSON.stringify(scrapedData)}`, | ||
// ); | ||
// } | ||
// } | ||
// | ||
// return buildResponseProxy({ | ||
// message: `Added ${addCount} new bills and updated ${updateCount} existing bills.`, | ||
// }); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
import "https://esm.sh/@supabase/functions-js/src/edge-runtime.d.ts"; | ||
import proxyToResponseWrapper from "../lib/utils/proxy-to-response-wrapper.ts"; | ||
import scrapeSittingReport from "../lib/shared-functions/scrape-sitting-report.ts"; | ||
|
||
Deno.serve(proxyToResponseWrapper(scrapeSittingReport)); |