From fa4194fa058537f501890117690c5b105b24f477 Mon Sep 17 00:00:00 2001 From: Lim Ding Wen Date: Fri, 26 Jul 2024 22:13:06 +0800 Subject: [PATCH] Add more descriptive return message to Scrape Sitting Report Also removes giant commented code --- .../shared-functions/scrape-sitting-report.ts | 104 ++---------------- 1 file changed, 10 insertions(+), 94 deletions(-) diff --git a/supabase/functions/lib/shared-functions/scrape-sitting-report.ts b/supabase/functions/lib/shared-functions/scrape-sitting-report.ts index fa903d5..75aa2f5 100644 --- a/supabase/functions/lib/shared-functions/scrape-sitting-report.ts +++ b/supabase/functions/lib/shared-functions/scrape-sitting-report.ts @@ -82,6 +82,9 @@ export default async function scrapeSittingReport(req: Request) { `Inserted sitting metadata ${JSON.stringify(sittingReportMetadata)} with ID ${sittingId.id}.`, ); + let debateCount = 0; + let debateSpeechCount = 0; + let debateOrderNo = 0; for (const sittingReportItem of sittingReport.takesSectionVOList) { console.log("Scraping debate..."); @@ -103,6 +106,7 @@ export default async function scrapeSittingReport(req: Request) { console.log( `Inserted debate ${JSON.stringify(debateData)} with ID ${debateId.id}.`, ); + debateCount++; console.log("Scraping debate speeches..."); const debateContentHtml = sittingReportItem.content; @@ -131,7 +135,8 @@ export default async function scrapeSittingReport(req: Request) { debateId.id, contentBuffer, ); - debateSpeechOrderNo++; + debateSpeechCount++; + contentBuffer = ""; } @@ -146,100 +151,11 @@ export default async function scrapeSittingReport(req: Request) { debateId.id, contentBuffer, ); + debateSpeechCount++; } } - return buildResponseProxy({ message: "done" }); - - // - // console.log("Parsing the downloaded HTML..."); - // const billsIntroducedHtml = await billsIntroducedResponse.text(); - // console.log(billsIntroducedHtml); - // const billsIntroducedDoc = new DOMParser().parseFromString( - // billsIntroducedHtml, - // "text/html", - // ); - // - // let addCount = 0; - // let updateCount = 0; - // - // console.log("Scraping and uploading relevant data..."); - // const billsIntroduced = billsIntroducedDoc.querySelectorAll( - // ".indv-bill", - // ) as Iterable; - // for (const billIntroduced of billsIntroduced) { - // const is_second_reading_next_available_seating = billIntroduced - // .querySelector(".indv-bill .row:nth-of-type(2) div:nth-of-type(2)")! - // .textContent.includes("Next Available Sitting"); - // const passed_date: string | undefined = billIntroduced - // .querySelector(".indv-bill .row:nth-of-type(2) div:nth-of-type(3)")! - // .textContent.match(/(\d{2}\.\d{2}\.\d{4})/gm)?.[0]; - // - // const scrapedData = { - // bill_no: billIntroduced - // .querySelector(".indv-bill .bill-title div:nth-of-type(2)")! - // .textContent.match(/(\d+\/\d{4})/gm)?.[0]!, - // name: billIntroduced.querySelector("a")?.getAttribute("title")!, - // date_introduced: toIsoDate( - // billIntroduced - // .querySelector(".indv-bill .row:nth-of-type(2) div:nth-of-type(1)")! - // .textContent.match(/(\d{2}\.\d{2}\.\d{4})/gm)?.[0]!, - // ), - // second_reading_date_type: is_second_reading_next_available_seating - // ? "next_available_seating" - // : "explicit", - // second_reading_date: is_second_reading_next_available_seating - // ? null - // : toIsoDate( - // billIntroduced - // .querySelector( - // ".indv-bill .row:nth-of-type(2) div:nth-of-type(2)", - // )! - // .textContent.match(/(\d{2}\.\d{2}\.\d{4})/gm)?.[0]!, - // ), - // is_passed: passed_date !== undefined, - // passed_date: passed_date ? toIsoDate(passed_date) : null, - // pdf_url: sanitiseUrl( - // billIntroduced.querySelector("a")!.getAttribute("href")!, - // ), - // original_text: null, - // summary: null, - // }; - // - // const billExists = - // (await supabase - // .from("bill") - // .select("bill_no") - // .eq("bill_no", scrapedData.bill_no) - // .maybeSingle()) != null; - // - // if (billExists) { - // // Make sure that we don't overwrite the original_text and summary values - // const { - // original_text: _original_text, - // summary: _summary, - // ...scrapedDataToUpdate - // } = scrapedData; - // const { error } = await supabase - // .from("bill") - // .update(scrapedDataToUpdate) - // .eq("bill_no", scrapedData.bill_no); - // if (error) throw error; - // updateCount++; - // console.info( - // `Updated bill ${scrapedData.bill_no} with the following data: ${JSON.stringify(scrapedDataToUpdate)}`, - // ); - // } else { - // const { error } = await supabase.from("bill").insert(scrapedData); - // if (error) throw error; - // addCount++; - // console.info( - // `Added bill ${scrapedData.bill_no} with the following data: ${JSON.stringify(scrapedData)}`, - // ); - // } - // } - // - // return buildResponseProxy({ - // message: `Added ${addCount} new bills and updated ${updateCount} existing bills.`, - // }); + return buildResponseProxy({ + message: `Inserted sitting ${JSON.stringify(sittingReportMetadata)} with ${debateCount} debates and ${debateSpeechCount} speeches.`, + }); }