Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added more strict code for recognizing PDF document types #120

Merged
merged 2 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added data/HowToReadETfromMSStatement.pdf
Binary file not shown.
107 changes: 105 additions & 2 deletions src/pdfparser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

pub use crate::logging::ResultExt;

/// Kind of PDF document identified by the parser from the document's first page.
#[derive(Clone, Debug, PartialEq)]
enum StatementType {
/// The document was not recognized as any supported statement kind.
UnknownDocument,
/// The document was recognized as a brokerage statement.
BrokerageStatement,
/// The document was recognized as an account statement.
AccountStatement,
}
Expand Down Expand Up @@ -352,15 +354,44 @@
.as_ref()
.ok_or("Unable to get content of first PDF page")?;

let mut statement_type = StatementType::BrokerageStatement;
let mut statement_type = StatementType::UnknownDocument;
contents.operations.iter().try_for_each(|op| {
log::trace!("Detected PDF command: {}",op.operator);
match op.operator.as_ref() {
"TJ" => {
// Text show
if op.operands.len() > 0 {
//transaction_date = op.operands[0];
let a = &op.operands[0];
log::trace!("Detected PDF text object: {a}");
match a {
Primitive::Array(c) => {
for e in c {
if let Primitive::String(actual_string) = e {
let raw_string = actual_string.clone().into_string();
let rust_string = if let Ok(r) = raw_string {
r.trim().to_uppercase()
} else {
"".to_owned()
};
if rust_string.contains("ACCT:") {
statement_type = StatementType::BrokerageStatement;
log::info!("PDF parser recognized Brokerage Statement document by finding: \"{rust_string}\"");
return Ok(());
}
}
}
}
_ => (),
}
}
},
"Tj" => {
// Text show
if op.operands.len() > 0 {
//transaction_date = op.operands[0];
let a = &op.operands[0];
log::info!("Detected PDF object: {a}");
log::info!("Detected PDF text object: {a}");
match a {
Primitive::String(actual_string) => {
let raw_string = actual_string.clone().into_string();
Expand Down Expand Up @@ -399,7 +430,7 @@
sequence: &mut std::collections::VecDeque<Box<dyn Entry>>,
transaction_type: TransactionType,
) -> Result<ParserState, String> {
let mut state = ParserState::ProcessingTransaction(transaction_type.clone());

Check warning on line 433 in src/pdfparser.rs

View workflow job for this annotation

GitHub Actions / coverage

value assigned to `state` is never read
let possible_obj = sequence.pop_front();
match possible_obj {
// Move executed parser objects into Vector
Expand Down Expand Up @@ -892,6 +923,10 @@

let (interests_transactions, div_transactions, sold_transactions, trades) = match document_type
{
StatementType::UnknownDocument => {
log::info!("Processing unknown document PDF");
return Err(format!("Unsupported PDF document type: {pdftoparse}"));
}
StatementType::BrokerageStatement => {
log::info!("Processing brokerage statement PDF");
parse_brokerage_statement(pdffile_iter)?
Expand Down Expand Up @@ -1090,6 +1125,74 @@
Ok(())
}

/// Verifies that the first page of the MS client statement PDF is classified
/// as `StatementType::AccountStatement`.
///
/// Ignored by default; presumably the input PDF is not shipped with the
/// repository (TODO confirm).
#[test]
#[ignore]
fn test_recognize_document_type_ms() -> Result<(), String> {
    let pdf_path = "etrade_data_2023/MS_ClientStatements_6557_202309.pdf";

    // Load the document; the underlying error is replaced by a message naming the file.
    let document = match File::<Vec<u8>>::open(pdf_path) {
        Ok(doc) => doc,
        Err(_) => return Err(format!("Error opening and parsing file: {}", pdf_path)),
    };

    // Recognition only needs the first page of the document.
    let mut pages = document.pages();
    let leading_page = match pages.next().unwrap() {
        Ok(page) => page,
        Err(_) => return Err("Unable to get first page of PDF file".to_string()),
    };

    let detected = recognize_statement(leading_page)?;
    assert_eq!(detected, StatementType::AccountStatement);

    Ok(())
}

/// Verifies that the first page of the brokerage statement PDF is classified
/// as `StatementType::BrokerageStatement`.
///
/// Ignored by default; presumably the input PDF is not shipped with the
/// repository (TODO confirm).
#[test]
#[ignore]
fn test_recognize_document_type_bs() -> Result<(), String> {
    let pdf_path = "etrade_data_2023/Brokerage Statement - XXXXX6557 - 202302.pdf";

    // Load the document; the underlying error is replaced by a message naming the file.
    let document = match File::<Vec<u8>>::open(pdf_path) {
        Ok(doc) => doc,
        Err(_) => return Err(format!("Error opening and parsing file: {}", pdf_path)),
    };

    // Recognition only needs the first page of the document.
    let mut pages = document.pages();
    let leading_page = match pages.next().unwrap() {
        Ok(page) => page,
        Err(_) => return Err("Unable to get first page of PDF file".to_string()),
    };

    let detected = recognize_statement(leading_page)?;
    assert_eq!(detected, StatementType::BrokerageStatement);

    Ok(())
}

/// Verifies that a PDF which is not a supported statement
/// (`data/HowToReadETfromMSStatement.pdf`) is classified as
/// `StatementType::UnknownDocument`.
#[test]
fn test_recognize_document_type_unk() -> Result<(), String> {
    let pdf_path = "data/HowToReadETfromMSStatement.pdf";

    // Load the document; the underlying error is replaced by a message naming the file.
    let document = match File::<Vec<u8>>::open(pdf_path) {
        Ok(doc) => doc,
        Err(_) => return Err(format!("Error opening and parsing file: {}", pdf_path)),
    };

    // Recognition only needs the first page of the document.
    let mut pages = document.pages();
    let leading_page = match pages.next().unwrap() {
        Ok(page) => page,
        Err(_) => return Err("Unable to get first page of PDF file".to_string()),
    };

    let detected = recognize_statement(leading_page)?;
    assert_eq!(detected, StatementType::UnknownDocument);

    Ok(())
}

#[test]
#[ignore]
fn test_account_statement() -> Result<(), String> {
Expand Down
11 changes: 7 additions & 4 deletions src/transactions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ pub fn reconstruct_sold_transactions(
log::info!("Candidate Sold transaction from PDF: trade_date: {trade_dt} income: {income}");
let trade_date_pdf = chrono::NaiveDate::parse_from_str(&trade_dt, "%m/%d/%y").expect_and_log(&format!("Unable to parse trade date: {trade_dt}"));
trade_date == trade_date_pdf
}).expect_and_log(&format!("\n\nERROR: Sold transaction in Gain&Losses:\n (trade_date: {tr_date}, acquisition date: {acquisition_date}, cost basis: {cost_basis}, income: {inc}) exist,\n but corressponding data from PDF document is missing. You can download account statements PDF documents at:\n
https://edoc.etrade.com/e/t/onlinedocs/docsearch?doc_type=stmt\n\n"));
}).ok_or(format!("\n\nERROR: Sold transaction in Gain&Losses:\n (trade_date: {tr_date}, acquisition date: {acquisition_date}, cost basis: {cost_basis}, income: {inc}) exist,\n but corressponding data from PDF document is missing. You can download account statements PDF documents at:\n
https://edoc.etrade.com/e/t/onlinedocs/docsearch?doc_type=stmt\n\n"))?;

detailed_sold_transactions.push((
chrono::NaiveDate::parse_from_str(&tr_date, "%m/%d/%Y")
Expand Down Expand Up @@ -698,7 +698,6 @@ mod tests {
}

#[test]
#[should_panic]
fn test_sold_transaction_reconstruction_second_fail() {
let parsed_sold_transactions: Vec<(String, String, f32, f32, f32)> = vec![(
"11/07/22".to_string(), // trade date
Expand Down Expand Up @@ -732,7 +731,11 @@ mod tests {
),
];

let _ = reconstruct_sold_transactions(&parsed_sold_transactions, &parsed_gains_and_losses);
assert_eq!(
reconstruct_sold_transactions(&parsed_sold_transactions, &parsed_gains_and_losses)
.is_ok(),
false
);
}

#[test]
Expand Down
Loading