Skip to content

Commit

Permalink
Added more strict code for recognizing PDF document types (#120)
Browse files Browse the repository at this point in the history
* - Improved detection of types of PDF documents

* - Added propagating reconstruction error
  • Loading branch information
jczaja authored Apr 30, 2024
1 parent f56db49 commit 4b19848
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 6 deletions.
Binary file added data/HowToReadETfromMSStatement.pdf
Binary file not shown.
107 changes: 105 additions & 2 deletions src/pdfparser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use pdf::primitive::Primitive;

pub use crate::logging::ResultExt;

/// Kind of PDF document, as determined by `recognize_statement` from the
/// content of the first page.
#[derive(Clone, Debug, PartialEq)]
enum StatementType {
/// Starting value in `recognize_statement`, kept when nothing on the page
/// identifies the document; `parse_statement` rejects it with an
/// "Unsupported PDF document type" error.
UnknownDocument,
/// Selected when a text string on the first page contains "ACCT:".
BrokerageStatement,
/// NOTE(review): the detection criteria for this variant are not visible in
/// this part of the file; presumably a Morgan Stanley client statement,
/// judging by the recognition test input name - confirm.
AccountStatement,
}
Expand Down Expand Up @@ -352,15 +354,44 @@ fn recognize_statement(page: PageRc) -> Result<StatementType, String> {
.as_ref()
.ok_or("Unable to get content of first PDF page")?;

let mut statement_type = StatementType::BrokerageStatement;
let mut statement_type = StatementType::UnknownDocument;
contents.operations.iter().try_for_each(|op| {
log::trace!("Detected PDF command: {}",op.operator);
match op.operator.as_ref() {
"TJ" => {
// Text show
if op.operands.len() > 0 {
//transaction_date = op.operands[0];
let a = &op.operands[0];
log::trace!("Detected PDF text object: {a}");
match a {
Primitive::Array(c) => {
for e in c {
if let Primitive::String(actual_string) = e {
let raw_string = actual_string.clone().into_string();
let rust_string = if let Ok(r) = raw_string {
r.trim().to_uppercase()
} else {
"".to_owned()
};
if rust_string.contains("ACCT:") {
statement_type = StatementType::BrokerageStatement;
log::info!("PDF parser recognized Brokerage Statement document by finding: \"{rust_string}\"");
return Ok(());
}
}
}
}
_ => (),
}
}
},
"Tj" => {
// Text show
if op.operands.len() > 0 {
//transaction_date = op.operands[0];
let a = &op.operands[0];
log::info!("Detected PDF object: {a}");
log::info!("Detected PDF text object: {a}");
match a {
Primitive::String(actual_string) => {
let raw_string = actual_string.clone().into_string();
Expand Down Expand Up @@ -892,6 +923,10 @@ pub fn parse_statement(

let (interests_transactions, div_transactions, sold_transactions, trades) = match document_type
{
StatementType::UnknownDocument => {
log::info!("Processing unknown document PDF");
return Err(format!("Unsupported PDF document type: {pdftoparse}"));
}
StatementType::BrokerageStatement => {
log::info!("Processing brokerage statement PDF");
parse_brokerage_statement(pdffile_iter)?
Expand Down Expand Up @@ -1090,6 +1125,74 @@ mod tests {
Ok(())
}

/// Checks that the first page of the `MS_ClientStatements` sample PDF is
/// recognized as [`StatementType::AccountStatement`].
///
/// NOTE(review): marked `#[ignore]`, presumably because the input PDF is not
/// shipped with the repository - confirm.
///
/// # Errors
///
/// Returns an error message when the file cannot be opened, has no pages, its
/// first page cannot be read, or recognition itself fails.
#[test]
#[ignore]
fn test_recognize_document_type_ms() -> Result<(), String> {
    let pdftoparse = "etrade_data_2023/MS_ClientStatements_6557_202309.pdf";

    let mypdffile = File::<Vec<u8>>::open(pdftoparse)
        .map_err(|_| format!("Error opening and parsing file: {pdftoparse}"))?;

    let mut pdffile_iter = mypdffile.pages();

    // Recognition only needs the first page. A PDF without any page is
    // reported as an error, like every other failure here, instead of
    // panicking on `unwrap()`.
    let first_page = pdffile_iter
        .next()
        .ok_or_else(|| format!("PDF file has no pages: {pdftoparse}"))?
        .map_err(|_| "Unable to get first page of PDF file".to_string())?;

    let document_type = recognize_statement(first_page)?;

    assert_eq!(document_type, StatementType::AccountStatement);

    Ok(())
}

/// Checks that the first page of the "Brokerage Statement" sample PDF is
/// recognized as [`StatementType::BrokerageStatement`].
///
/// NOTE(review): marked `#[ignore]`, presumably because the input PDF is not
/// shipped with the repository - confirm.
///
/// # Errors
///
/// Returns an error message when the file cannot be opened, has no pages, its
/// first page cannot be read, or recognition itself fails.
#[test]
#[ignore]
fn test_recognize_document_type_bs() -> Result<(), String> {
    let pdftoparse = "etrade_data_2023/Brokerage Statement - XXXXX6557 - 202302.pdf";

    let mypdffile = File::<Vec<u8>>::open(pdftoparse)
        .map_err(|_| format!("Error opening and parsing file: {pdftoparse}"))?;

    let mut pdffile_iter = mypdffile.pages();

    // Recognition only needs the first page. A PDF without any page is
    // reported as an error, like every other failure here, instead of
    // panicking on `unwrap()`.
    let first_page = pdffile_iter
        .next()
        .ok_or_else(|| format!("PDF file has no pages: {pdftoparse}"))?
        .map_err(|_| "Unable to get first page of PDF file".to_string())?;

    let document_type = recognize_statement(first_page)?;

    assert_eq!(document_type, StatementType::BrokerageStatement);

    Ok(())
}

/// Checks that a PDF which is not a supported statement
/// (`data/HowToReadETfromMSStatement.pdf`) is recognized as
/// [`StatementType::UnknownDocument`].
///
/// # Errors
///
/// Returns an error message when the file cannot be opened, has no pages, its
/// first page cannot be read, or recognition itself fails.
#[test]
fn test_recognize_document_type_unk() -> Result<(), String> {
    let pdftoparse = "data/HowToReadETfromMSStatement.pdf";

    let mypdffile = File::<Vec<u8>>::open(pdftoparse)
        .map_err(|_| format!("Error opening and parsing file: {pdftoparse}"))?;

    let mut pdffile_iter = mypdffile.pages();

    // Recognition only needs the first page. A PDF without any page is
    // reported as an error, like every other failure here, instead of
    // panicking on `unwrap()`.
    let first_page = pdffile_iter
        .next()
        .ok_or_else(|| format!("PDF file has no pages: {pdftoparse}"))?
        .map_err(|_| "Unable to get first page of PDF file".to_string())?;

    let document_type = recognize_statement(first_page)?;

    assert_eq!(document_type, StatementType::UnknownDocument);

    Ok(())
}

#[test]
#[ignore]
fn test_account_statement() -> Result<(), String> {
Expand Down
11 changes: 7 additions & 4 deletions src/transactions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ pub fn reconstruct_sold_transactions(
log::info!("Candidate Sold transaction from PDF: trade_date: {trade_dt} income: {income}");
let trade_date_pdf = chrono::NaiveDate::parse_from_str(&trade_dt, "%m/%d/%y").expect_and_log(&format!("Unable to parse trade date: {trade_dt}"));
trade_date == trade_date_pdf
}).expect_and_log(&format!("\n\nERROR: Sold transaction in Gain&Losses:\n (trade_date: {tr_date}, acquisition date: {acquisition_date}, cost basis: {cost_basis}, income: {inc}) exist,\n but corressponding data from PDF document is missing. You can download account statements PDF documents at:\n
https://edoc.etrade.com/e/t/onlinedocs/docsearch?doc_type=stmt\n\n"));
}).ok_or(format!("\n\nERROR: Sold transaction in Gain&Losses:\n (trade_date: {tr_date}, acquisition date: {acquisition_date}, cost basis: {cost_basis}, income: {inc}) exist,\n but corressponding data from PDF document is missing. You can download account statements PDF documents at:\n
https://edoc.etrade.com/e/t/onlinedocs/docsearch?doc_type=stmt\n\n"))?;

detailed_sold_transactions.push((
chrono::NaiveDate::parse_from_str(&tr_date, "%m/%d/%Y")
Expand Down Expand Up @@ -698,7 +698,6 @@ mod tests {
}

#[test]
#[should_panic]
fn test_sold_transaction_reconstruction_second_fail() {
let parsed_sold_transactions: Vec<(String, String, f32, f32, f32)> = vec![(
"11/07/22".to_string(), // trade date
Expand Down Expand Up @@ -732,7 +731,11 @@ mod tests {
),
];

let _ = reconstruct_sold_transactions(&parsed_sold_transactions, &parsed_gains_and_losses);
assert_eq!(
reconstruct_sold_transactions(&parsed_sold_transactions, &parsed_gains_and_losses)
.is_ok(),
false
);
}

#[test]
Expand Down

0 comments on commit 4b19848

Please sign in to comment.