Skip to content

Commit

Permalink
working date filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
JFriel committed Oct 9, 2023
1 parent 2bd16b2 commit 512436f
Showing 1 changed file with 15 additions and 12 deletions.
27 changes: 15 additions & 12 deletions Rdmp.Core/DataExport/DataExtraction/Pipeline/ExtractionHoldout.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using NPOI.SS.Formula.Functions;
using Org.BouncyCastle.Asn1.X509.Qualified;
using Rdmp.Core.CommandExecution;
using Rdmp.Core.CommandExecution.AtomicCommands.CatalogueCreationCommands;
using Rdmp.Core.Curation.Data;
Expand Down Expand Up @@ -47,8 +48,6 @@ public class ExtractionHoldout : IPluginDataFlowComponent<DataTable>, IPipelineR
public IExtractDatasetCommand Request { get; private set; }


private DataTable FilterableData { get; set; } //Rows that are valid as holdout data based on the user filters

private bool validateIfRowShouldBeFiltered(DataRow row)
{
if (!string.IsNullOrEmpty(dateColumn))
Expand Down Expand Up @@ -76,20 +75,25 @@ private bool validateIfRowShouldBeFiltered(DataRow row)
return true;
}

private DataTable filterRowsBasedOnHoldoutDates(DataTable toProcess)
private void filterRowsBasedOnHoldoutDates(DataTable toProcess)
{
DataTable filteredTable = toProcess.AsEnumerable().Where(row => validateIfRowShouldBeFiltered(row)).CopyToDataTable();
return filteredTable;
toProcess.Columns.Add("_isValidHoldout", typeof(bool));
foreach(DataRow row in toProcess.Rows)
{
row["_isValidHoldout"] = validateIfRowShouldBeFiltered(row);
}
// DataTable filteredTable = toProcess.AsEnumerable().Where(row => validateIfRowShouldBeFiltered(row)).CopyToDataTable();
// return filteredTable;
}

private int getHoldoutRowCount(DataTable toProcess, IDataLoadEventListener listener)
{

int rowCount = holdoutCount;
if (rowCount >= FilterableData.Rows.Count && !isPercentage)
if (rowCount >= toProcess.Rows.Count && !isPercentage)
{
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, "More holdout data was requested than there is available data. All valid data will be held back"));
rowCount = FilterableData.Rows.Count;
rowCount = toProcess.Rows.Count;
}
if (isPercentage)
{
Expand All @@ -98,7 +102,7 @@ private int getHoldoutRowCount(DataTable toProcess, IDataLoadEventListener liste
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, "Holdout percentage was >100%. Will use 100%"));
holdoutCount = 100;
}
rowCount = FilterableData.Rows.Count / 100 * holdoutCount;
rowCount = toProcess.Rows.Count / 100 * holdoutCount;
}
return rowCount;
}
Expand Down Expand Up @@ -140,10 +144,9 @@ public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener
{
return toProcess;
}
FilterableData = toProcess;
if (dateColumn is not null && (afterDate != DateTime.MinValue || beforeDate != DateTime.MinValue))
{
FilterableData = filterRowsBasedOnHoldoutDates(toProcess);
filterRowsBasedOnHoldoutDates(toProcess);
}

DataTable holdoutData = toProcess.Clone();
Expand All @@ -152,19 +155,19 @@ public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener
holdoutData.BeginLoadData();
toProcess.BeginLoadData();

var rowsToMove = FilterableData.AsEnumerable().OrderBy(r => rand.Next()).Take(holdoutCount);
var rowsToMove = toProcess.AsEnumerable().Where(row => row["_isValidHoldout"] is true).OrderBy(r => rand.Next()).Take(holdoutCount);
foreach (DataRow row in rowsToMove)
{
holdoutData.ImportRow(row);
toProcess.Rows.Remove(row);
//row.Delete();
}
holdoutData.EndLoadData();
toProcess.EndLoadData();
if (holdoutStorageLocation is not null && holdoutStorageLocation.Length > 0)
{
writeDataTabletoCSV(holdoutData);
}
toProcess.Columns.Remove("_isValidHoldout");
return toProcess;
}

Expand Down

0 comments on commit 512436f

Please sign in to comment.