From d09ea0070db809b42f6b71e4a21622a5ecde86ba Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Fri, 18 Oct 2024 13:18:01 +0200 Subject: [PATCH 01/37] Basic structure --- sources/RunValidation/Helper.cs | 310 ++++++++++++++ sources/RunValidation/Program.cs | 10 + sources/RunValidation/RunValidation.csproj | 14 + sources/RunValidation/Validation1.cs | 103 +++++ sources/RunValidation/Validation2.cs | 448 +++++++++++++++++++++ sources/org.ohdsi.cdm.sln | 29 +- 6 files changed, 913 insertions(+), 1 deletion(-) create mode 100644 sources/RunValidation/Helper.cs create mode 100644 sources/RunValidation/Program.cs create mode 100644 sources/RunValidation/RunValidation.csproj create mode 100644 sources/RunValidation/Validation1.cs create mode 100644 sources/RunValidation/Validation2.cs diff --git a/sources/RunValidation/Helper.cs b/sources/RunValidation/Helper.cs new file mode 100644 index 00000000..84993e1a --- /dev/null +++ b/sources/RunValidation/Helper.cs @@ -0,0 +1,310 @@ +using Amazon.S3; +using Amazon.S3.Model; +using Amazon.S3.Transfer; +using CsvHelper.Configuration; +using CsvHelper; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.IO.Compression; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using org.ohdsi.cdm.framework.common.Enums; + +namespace RunValidation +{ + + internal class Helper + { + internal static void CheckChunk(string localTmpPath, List objects, string awsAccessKeyId, string awsSecretAccessKey, string bucket, + KeyValuePair>> chunk) + { + + var missed = 0; + var dups = 0; + + var attempt = 0; + var complete = false; + + var config = new AmazonS3Config + { + Timeout = TimeSpan.FromMinutes(60), + RegionEndpoint = Amazon.RegionEndpoint.USEast1, + BufferSize = 1512 * 1024, + MaxErrorRetry = 120 + }; + + while (!complete) + { + try + { + attempt++; + + var timer = new 
Stopwatch(); + timer.Start(); + + var cnt = 0; + var attempt1 = attempt; + + Parallel.ForEach(objects, new ParallelOptions() { MaxDegreeOfParallelism = 10 }, o => + { + var loadAttempt = 0; + var loaded = false; + while (!loaded) + { + try + { + loadAttempt++; + using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, config)) + using (var transferUtility = new TransferUtility(client)) + { + transferUtility.Download($@"{localTmpPath}\{o.Key}", bucket, o.Key); + } + loaded = true; + } + catch (Exception) + { + if (loadAttempt <= 11) + { + Console.WriteLine(o.Key + " | " + loadAttempt); + } + else + { + throw; + } + } + } + + + using (var responseStream = File.Open($@"{localTmpPath}\{o.Key}", FileMode.Open)) + using (var bufferedStream = new BufferedStream(responseStream)) + using (var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress)) + using (var reader = new StreamReader(gzipStream, Encoding.Default)) + { + using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) + { + HasHeaderRecord = false, + Delimiter = ",", + Encoding = Encoding.UTF8 + }); + while (csv.Read()) + { + var personId = (long)csv.GetField(typeof(long), 0); + + chunk.Value[personId].Add(o.Key); + } + + Interlocked.Increment(ref cnt); + + Console.Write( + $"\rchunkId={chunk.Key} | {cnt} from {objects.Count} | attempt={attempt1}"); + } + + File.Delete($@"{localTmpPath}\{o.Key}"); + + }); + + + foreach (var ci in chunk.Value) + { + if (ci.Value.Count == 0) + { + missed++; + } + else if (ci.Value.Count > 1) + { + dups++; + } + } + + timer.Stop(); + Console.WriteLine($" | DONE | missed={missed}; dups={dups} | total={timer.ElapsedMilliseconds}ms"); + + complete = true; + } + catch (Exception) + { + Console.Write(" | Exception"); + if (attempt > 3) + { + throw; + } + } + } + } + + internal static IEnumerable> GetObjectsFromS3(Vendor vendor, int buildingId, string awsAccessKeyId, string awsSecretAccessKey, + string bucket, string 
cdmFolder, string table, int chunkId, int slicesNum) + { + for (int i = 0; i < slicesNum; i++) + { + var prefix = $"{vendor}/{buildingId}/{cdmFolder}/{table}/{table}.{i}.{chunkId}."; + + using var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, + Amazon.RegionEndpoint.USEast1); + var request = new ListObjectsV2Request + { + BucketName = bucket, + Prefix = prefix + }; + ListObjectsV2Response response; + do + { + var responseTask = client.ListObjectsV2Async(request); + responseTask.Wait(); + response = responseTask.Result; + + yield return response.S3Objects; + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated); + } + + } + + internal static List FindSlicesByPersonIds(string awsAccessKeyId, string awsSecretAccessKey, string bucket, Vendor vendor, int buildingId, int chunkId, string table, Dictionary personIds, int personIndex) + { + var prefix = $"{vendor}/{buildingId}/raw/{chunkId}/{table}/{table}"; + + var input = new ConcurrentDictionary(); + + foreach (var pId in personIds.Keys) + { + input.TryAdd(pId, false); + } + + var result = new ConcurrentDictionary(); + using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, + Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = bucket, + Prefix = prefix + }; + + var r = client.ListObjectsV2Async(request); + r.Wait(); + var response = r.Result; + var rows = new List(); + + Parallel.ForEach(response.S3Objects, o => + { + using var transferUtility = new TransferUtility(awsAccessKeyId, awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(bucket, o.Key); + using var bufferedStream = new BufferedStream(responseStream); + using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); + using var reader = new StreamReader(gzipStream, Encoding.Default); + string line; + while ((line = reader.ReadLine()) != null) + { + if (input.IsEmpty) 
+ break; + + if (!string.IsNullOrEmpty(line)) + { + long personId = long.Parse(line.Split('\t')[personIndex]); + if (personIds.ContainsKey(personId)) + { + result.TryAdd(o.Key, false); + input.TryRemove(personId, out var res); + break; + } + } + } + }); + } + + return [.. result.Keys]; + } + + internal static IEnumerable>>> GetChunksFromS3(string localTmpPath, Vendor vendor, int buildingId, + string awsAccessKeyId, string awsSecretAccessKey, + string bucket) + { + var currentChunkId = 0; + var result = new KeyValuePair>>(0, []); + var prefix = $"{vendor}/{buildingId}/_chunks"; + using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, + Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = bucket, + Prefix = prefix + }; + + var response = client.ListObjectsV2Async(request); + response.Wait(); + + foreach (var o in response.Result.S3Objects.OrderBy(o => o.LastModified)) + { + var loadAttempt = 0; + var loaded = false; + while (!loaded) + { + try + { + loadAttempt++; + using (var transferUtility = new TransferUtility(client)) + { + transferUtility.Download($@"{localTmpPath}\{o.Key}", bucket, o.Key); + } + loaded = true; + } + catch (Exception) + { + if (loadAttempt <= 11) + { + Console.WriteLine(o.Key + " | " + loadAttempt); + } + else + { + throw; + } + } + } + + using (var responseStream = File.Open($@"{localTmpPath}\{o.Key}", FileMode.Open)) + using (var bufferedStream = new BufferedStream(responseStream)) + using (var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress)) + using (var reader = new StreamReader(gzipStream, Encoding.Default)) + { + string line; + while ((line = reader.ReadLine()) != null) + { + if (!string.IsNullOrEmpty(line)) + { + var chunkId = int.Parse(line.Split('\t')[0]); + + if (currentChunkId != chunkId) + { + if (result.Value.Count > 0) + yield return result; + + result = new KeyValuePair>>(chunkId, + []); + currentChunkId = chunkId; + } + + var personId = 
long.Parse(line.Split('\t')[1]); + result.Value.Add(personId, []); + } + } + } + + File.Delete($@"{localTmpPath}\{o.Key}"); + } + } + + if (result.Value.Count > 0) + yield return result; + } + } + +} \ No newline at end of file diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs new file mode 100644 index 00000000..e48a3e7d --- /dev/null +++ b/sources/RunValidation/Program.cs @@ -0,0 +1,10 @@ +namespace RunValidation +{ + internal class Program + { + static void Main(string[] args) + { + Console.WriteLine("Hello, World!"); + } + } +} diff --git a/sources/RunValidation/RunValidation.csproj b/sources/RunValidation/RunValidation.csproj new file mode 100644 index 00000000..1adb2faa --- /dev/null +++ b/sources/RunValidation/RunValidation.csproj @@ -0,0 +1,14 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + diff --git a/sources/RunValidation/Validation1.cs b/sources/RunValidation/Validation1.cs new file mode 100644 index 00000000..83e8a307 --- /dev/null +++ b/sources/RunValidation/Validation1.cs @@ -0,0 +1,103 @@ +using Amazon.S3.Model; +using org.ohdsi.cdm.framework.common.Enums; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Numerics; + +namespace RunValidation +{ + + public class Validation1(string awsAccessKeyId, string awsSecretAccessKey, string bucket, string tmpFolder) + { + private readonly string _awsAccessKeyId = awsAccessKeyId; + private readonly string _awsSecretAccessKey = awsSecretAccessKey; + private readonly string _bucket = bucket; + private readonly string _tmpFolder = tmpFolder; + private string _cdmFolder; + + public void Start(Vendor vendor, int buildingId, int slicesNum, string cdmFolder) + { + _cdmFolder = cdmFolder; + + Console.WriteLine($"{vendor}.{buildingId}"); + List wrong = []; + HashSet missed = []; + var cIds = new HashSet(); + var pIds = new HashSet(); + var f = new HashSet(); + var s = new HashSet(); + var timer = new Stopwatch(); + 
timer.Start(); + + foreach (var chunk in Helper.GetChunksFromS3(_tmpFolder, vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket)) + { + var chunkId = chunk.Key; + + var objects = new List(); + foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, + _cdmFolder, "PERSON", chunkId, slicesNum)) + { + objects.AddRange(o); + } + + foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, + _cdmFolder, "METADATA_TMP", chunkId, slicesNum)) + { + objects.AddRange(o); + } + + if (objects.Count == 0) + { + wrong.Add($"chunkId={chunkId} - MISSED"); + } + + Helper.CheckChunk(_tmpFolder, objects, _awsAccessKeyId, _awsSecretAccessKey, _bucket, chunk); + + int missedCnt = 0; + var missedPersonIds = new Dictionary(); + + foreach (var c in chunk.Value) + { + if (c.Value.Count != 1) + { + missedCnt++; + + if (missedCnt == 1 || missedCnt % 500 == 0) + missedPersonIds.Add(c.Key, false); + } + + if (c.Value.Count != 1) + { + wrong.Add($"chunkId={chunkId};person_id={c.Key};filese={string.Join(',', [.. 
c.Value])}"); + cIds.Add(chunkId); + pIds.Add(c.Key); + + foreach (var v in c.Value) + { + f.Add(v); + s.Add($@"done.Add(Process(vendor, buildingId, {chunkId}, ""{Int32.Parse(v.Split('/')[4].Split('.')[1]):0000}"", true));"); + } + } + } + + if (missedPersonIds.Count > 0) + { + foreach (var r in Helper.FindSlicesByPersonIds(_awsAccessKeyId, _awsSecretAccessKey, _bucket, vendor, buildingId, chunkId, vendor.PersonTableName, missedPersonIds, vendor.PersonIdIndex)) + { + missed.Add(r); + var fileName = r.Replace(@"\", "_").Replace(@"/", "_"); + File.Create($@"{_tmpFolder}\{fileName}.txt").Dispose(); + Console.WriteLine(fileName); + } + } + } + + Console.WriteLine(); + timer.Stop(); + Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); + timer.Restart(); + } + } +} \ No newline at end of file diff --git a/sources/RunValidation/Validation2.cs b/sources/RunValidation/Validation2.cs new file mode 100644 index 00000000..2cd32cb5 --- /dev/null +++ b/sources/RunValidation/Validation2.cs @@ -0,0 +1,448 @@ +using Amazon.S3; +using Amazon.S3.Model; +using Amazon.S3.Transfer; +using CsvHelper; +using CsvHelper.Configuration; +using org.ohdsi.cdm.framework.common.Helpers; +using org.ohdsi.cdm.framework.desktop.Settings; +using System.Diagnostics; +using System.Globalization; +using System.IO.Compression; +using System.Text; + +namespace RunValidation +{ + public class Validation2 + { + private string _cdmFolder = "cdmCSV"; + private LambdaUtility _lambdaUtility; + + public void Start(LambdaUtility utility, string cdmCsvFolder) + { + var wrong = new List(); + var timer = new Stopwatch(); + timer.Start(); + _cdmFolder = cdmCsvFolder; + + var slicesNum = GetSlicesNum(); + _lambdaUtility = utility; + + foreach (var chunk in GetChunk()) + { + var chunkId = chunk.Key; + var objects = new List(); + foreach (var o in GetObjects("PERSON", chunkId, slicesNum)) + { + objects.AddRange(o); + } + + foreach (var o in GetObjects("METADATA_TMP", chunkId, slicesNum)) + { + 
objects.AddRange(o); + } + + if (objects.Count == 0) + { + wrong.Add($"chunkId={chunkId} - MISSED"); + } + + ProcessChunk(objects, chunk, slicesNum, false); + + foreach (var c in chunk.Value) + { + if (c.Value.Count != 1) + wrong.Add($"chunkId={chunkId};person_id={c.Key};files={string.Join(',', [.. c.Value])}"); + } + } + + Console.WriteLine(); + timer.Stop(); + Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); + timer.Restart(); + } + + private int GetSlicesNum() + { + var slices = new HashSet(); + var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/{_cdmFolder}/PERSON/PERSON."; + Console.WriteLine("Calculating slices num " + Settings.Current.Bucket + "|" + prefix); + using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, + Amazon.RegionEndpoint.USEast1)) + + { + var request = new ListObjectsV2Request + { + BucketName = Settings.Current.Bucket, + Prefix = prefix + }; + ListObjectsV2Response response; + + do + { + var responseTask = client.ListObjectsV2Async(request); + responseTask.Wait(); + response = responseTask.Result; + + foreach (var o in response.S3Objects) + { + slices.Add(o.Key.Split('.')[1]); + } + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated); + } + + Console.WriteLine("slices.Count=" + slices.Count); + + return slices.Count; + } + + private static IEnumerable>>> GetChunk() + { + var currentChunkId = 0; + var result = new KeyValuePair>>(0, []); + var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/_chunks"; + using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, + Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = Settings.Current.Bucket, + Prefix = prefix + }; + + var response = client.ListObjectsV2Async(request); + response.Wait(); + + foreach (var o in 
response.Result.S3Objects.OrderBy(o => o.LastModified)) + { + using var transferUtility = new TransferUtility(Settings.Current.S3AwsAccessKeyId, + Settings.Current.S3AwsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(Settings.Current.Bucket, o.Key); + using var bufferedStream = new BufferedStream(responseStream); + using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); + using var reader = new StreamReader(gzipStream, Encoding.Default); + string line; + while ((line = reader.ReadLine()) != null) + { + if (!string.IsNullOrEmpty(line)) + { + var chunkId = int.Parse(line.Split('\t')[0]); + + if (currentChunkId != chunkId) + { + if (result.Value.Count > 0) + yield return result; + + result = new KeyValuePair>>(chunkId, + []); + currentChunkId = chunkId; + } + + var personId = long.Parse(line.Split('\t')[1]); + result.Value.Add(personId, []); + } + } + } + } + + if (result.Value.Count > 0) + yield return result; + } + + public IEnumerable> GetObjects(string table, int chunkId, int slicesNum) + { + for (int i = 0; i < slicesNum; i++) + { + var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/{_cdmFolder}/{table}/{table}.{i}.{chunkId}."; + using var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, + Amazon.RegionEndpoint.USEast1); + var request = new ListObjectsV2Request + { + BucketName = Settings.Current.Bucket, + Prefix = prefix + }; + ListObjectsV2Response response; + do + { + var responseTask = client.ListObjectsV2Async(request); + responseTask.Wait(); + response = responseTask.Result; + + yield return response.S3Objects; + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated); + } + + } + + public Dictionary ProcessChunk(List objects, KeyValuePair>> chunk, int slicesNum, bool onlyCheck) + { + var attempt = 0; + var complete = false; + + while (!complete) + 
{ + try + { + attempt++; + var missed = 0; + var dups = 0; + + foreach (var ci in chunk.Value) + { + ci.Value.Clear(); + } + + var timer = new Stopwatch(); + timer.Start(); + + var cnt = 0; + var attempt1 = attempt; + + Parallel.ForEach(objects, o => + { + using var transferUtility = new TransferUtility(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, + Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(Settings.Current.Bucket, o.Key); + using var bufferedStream = new BufferedStream(responseStream); + using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); + using var reader = new StreamReader(gzipStream, Encoding.Default); + using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) + { + HasHeaderRecord = false, + Delimiter = ",", + Encoding = Encoding.UTF8 + }); + while (csv.Read()) + { + var personId = (long)csv.GetField(typeof(long), 0); + + chunk.Value[personId].Add(o.Key); + } + Interlocked.Increment(ref cnt); + }); + + int wrngCnt = 0; + var wrongPersonIds = new Dictionary(); + foreach (var ci in chunk.Value) + { + if (ci.Value.Count == 0) + { + missed++; + } + else if (ci.Value.Count > 1) + { + dups++; + } + + if (ci.Value.Count != 1) + { + wrngCnt++; + + if (wrngCnt == 1 || wrngCnt % 500 == 0) + wrongPersonIds.Add(ci.Key, false); + } + } + + timer.Stop(); + + if (missed > 0 || dups > 0) + { + Console.WriteLine($"XXX ChunkId={chunk.Key} | missed={missed}; dups={dups} | {wrongPersonIds.Keys.Count}"); + if (!onlyCheck) + { + Cleanup(chunk.Key, slicesNum); + var tasks = _lambdaUtility.TriggerBuildFunction(Settings.Current.Building.Vendor, Settings.Current.Building.Id.Value, chunk.Key, false); + Task.WaitAll([.. 
tasks]); + + var checkCreation = Task.Run(() => _lambdaUtility.AllChunksWereDone(Settings.Current.Building.Vendor, + Settings.Current.Building.Id.Value, _lambdaUtility.BuildMessageBucket)); + + checkCreation.Wait(); + + foreach (var personId in chunk.Value.Keys) + { + chunk.Value[personId].Clear(); + } + + throw new Exception("restart"); + } + } + + complete = true; + return wrongPersonIds; + } + catch (Exception ex) + { + Console.Write(ex.Message + " | [ProcessChunk] Exception | new attempt | attempt=" + attempt); + if (attempt > 3) + { + throw; + } + } + } + return null; + } + + private void Cleanup(int chunkId, int slicesNum) + { + var tables = new[] + { + "PERSON", + "OBSERVATION_PERIOD", + "PAYER_PLAN_PERIOD", + "DEATH", + "DRUG_EXPOSURE", + "OBSERVATION", + "VISIT_OCCURRENCE", + "VISIT_DETAIL", + "PROCEDURE_OCCURRENCE", + "DRUG_ERA", + "CONDITION_ERA", + "DEVICE_EXPOSURE", + "MEASUREMENT", + "COHORT", + "CONDITION_OCCURRENCE", + "COST", + "NOTE", + "METADATA_TMP", + "FACT_RELATIONSHIP" + }; + + Console.WriteLine("Cleaning chunkId=" + chunkId); + + foreach (var table in tables) + { + Console.WriteLine("Cleaning table=" + table); + + for (var i = 0; i < slicesNum; i++) + { + Clean(chunkId, table, i); + } + } + + Console.WriteLine($"chunkId={chunkId} was cleaned"); + } + + private static IEnumerable GetLines(Stream stream) + { + using var bufferedStream = new BufferedStream(stream); + using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); + using var reader = new StreamReader(gzipStream, Encoding.Default); + string line; + while ((line = reader.ReadLine()) != null) + { + if (!string.IsNullOrEmpty(line)) + { + yield return line; + } + } + } + + public static List FindSlice(int chunkId, string table, Dictionary personIds, int personIndex) + { + var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/raw/{chunkId}/{table}/{table}"; + + var result = new HashSet(); + using (var client = new 
AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, + Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = Settings.Current.Bucket, + Prefix = prefix + }; + + var r = client.ListObjectsV2Async(request); + r.Wait(); + var response = r.Result; + var rows = new List(); + foreach (var o in response.S3Objects) + { + using var transferUtility = new TransferUtility(Settings.Current.S3AwsAccessKeyId, + Settings.Current.S3AwsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(Settings.Current.Bucket, o.Key); + { + foreach (var line in GetLines(responseStream)) + { + long personId = long.Parse(line.Split('\t')[personIndex]); + if (personIds.ContainsKey(personId)) + { + result.Add(o.Key); + break; + } + } + } + } + } + + return [.. result]; + } + + public void Clean(int chunkId, string table, int slice) + { + var attempt = 0; + var complete = false; + + while (!complete) + { + try + { + attempt++; + + var perfix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/{_cdmFolder}/{table}/{table}.{slice}.{chunkId}."; + + using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, + Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = Settings.Current.Bucket, + Prefix = perfix + }; + ListObjectsV2Response response; + do + { + using var getListObjects = client.ListObjectsV2Async(request); + getListObjects.Wait(); + response = getListObjects.Result; + + var multiObjectDeleteRequest = new DeleteObjectsRequest + { + BucketName = Settings.Current.Bucket + }; + + foreach (var o in response.S3Objects) + { + multiObjectDeleteRequest.AddKey(o.Key, null); + } + + if (response.S3Objects.Count > 0) + { + using var deleteObjects = client.DeleteObjectsAsync(multiObjectDeleteRequest); + deleteObjects.Wait(); + + 
//Console.WriteLine(response.S3Objects.Count + " files deleted"); + } + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated == true); + } + + complete = true; + } + catch (Exception ex) + { + Console.Write(" | [Clean] Exception | new attempt | " + attempt); + Console.WriteLine(ex.Message); + if (attempt > 3) + { + throw; + } + } + } + } + } +} \ No newline at end of file diff --git a/sources/org.ohdsi.cdm.sln b/sources/org.ohdsi.cdm.sln index bfbdcbef..24ca9d14 100644 --- a/sources/org.ohdsi.cdm.sln +++ b/sources/org.ohdsi.cdm.sln @@ -28,7 +28,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RunLocal", "Tests\RunLocal\ EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "org.ohdsi.cdm.framework.etl", "Framework\org.ohdsi.cdm.framework.etl\org.ohdsi.cdm.framework.etl.csproj", "{8460548C-48CF-4673-BB8C-46A7BDBE0BDA}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "org.ohdsi.cdm.framework", "Framework\org.ohdsi.cdm.framework\org.ohdsi.cdm.framework.csproj", "{214F9843-9033-489C-B26F-5D4AB45F4C52}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "org.ohdsi.cdm.framework", "Framework\org.ohdsi.cdm.framework\org.ohdsi.cdm.framework.csproj", "{214F9843-9033-489C-B26F-5D4AB45F4C52}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RunValidation", "RunValidation\RunValidation.csproj", "{52816524-F59A-49D4-A59E-622590CF2857}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -214,6 +216,30 @@ Global {214F9843-9033-489C-B26F-5D4AB45F4C52}.x64 Solution Configuration|x64.Build.0 = Debug|Any CPU {214F9843-9033-489C-B26F-5D4AB45F4C52}.x64 Solution Configuration|x86.ActiveCfg = Debug|Any CPU {214F9843-9033-489C-B26F-5D4AB45F4C52}.x64 Solution Configuration|x86.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|Any CPU.Build.0 = Debug|Any CPU + 
{52816524-F59A-49D4-A59E-622590CF2857}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|x64.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|x64.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|x86.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Debug|x86.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|Any CPU.ActiveCfg = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|Any CPU.Build.0 = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|x64.ActiveCfg = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|x64.Build.0 = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|x86.ActiveCfg = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.Release|x86.Build.0 = Release|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|Any CPU.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|Any CPU.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|Mixed Platforms.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|Mixed Platforms.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|x64.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|x64.Build.0 = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|x86.ActiveCfg = Debug|Any CPU + {52816524-F59A-49D4-A59E-622590CF2857}.x64 Solution Configuration|x86.Build.0 = Debug|Any CPU EndGlobalSection 
GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -226,6 +252,7 @@ Global {C357C5F3-1F29-45A8-B13C-3B1D30CF4295} = {69D5B1B9-1EF1-47EC-8FAF-88D5D2CF0A2C} {8460548C-48CF-4673-BB8C-46A7BDBE0BDA} = {29F8844D-3B39-4CC7-8502-19310854A064} {214F9843-9033-489C-B26F-5D4AB45F4C52} = {29F8844D-3B39-4CC7-8502-19310854A064} + {52816524-F59A-49D4-A59E-622590CF2857} = {69D5B1B9-1EF1-47EC-8FAF-88D5D2CF0A2C} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {1A9F9E1B-42CB-4795-9257-228BA3F6A766} From ee477ef29af54e7bcef60413ae396df1e2536f59 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Fri, 18 Oct 2024 14:42:33 +0200 Subject: [PATCH 02/37] .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d684824e..34090ae4 100644 --- a/.gitignore +++ b/.gitignore @@ -346,3 +346,4 @@ ETL-LambdaBuilder.Rproj sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/aws-lambda-tools-defaults.json sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/aws-lambda-tools-defaults.json /sources/Tests/RunLocal/App.config +/sources/RunValidation/App.config From 52e3dfc03c46971494d88492c330687bce270351 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Fri, 18 Oct 2024 20:16:55 +0200 Subject: [PATCH 03/37] Basic logic. Implement getting files from S3 for Person and metadata_tmp. 
--- sources/RunValidation/Helper.cs | 22 +- sources/RunValidation/Program.cs | 96 +++++- sources/RunValidation/RunValidation.csproj | 4 + .../RunLocal => RunValidation}/Validation.cs | 73 ++++- sources/RunValidation/Validation1.cs | 103 ------ sources/RunValidation/Validation2.cs | 1 + sources/Tests/RunLocal/Helper.cs | 308 ------------------ sources/Tests/RunLocal/Program.cs | 6 - 8 files changed, 172 insertions(+), 441 deletions(-) rename sources/{Tests/RunLocal => RunValidation}/Validation.cs (57%) delete mode 100644 sources/RunValidation/Validation1.cs delete mode 100644 sources/Tests/RunLocal/Helper.cs diff --git a/sources/RunValidation/Helper.cs b/sources/RunValidation/Helper.cs index 84993e1a..8a01f64c 100644 --- a/sources/RunValidation/Helper.cs +++ b/sources/RunValidation/Helper.cs @@ -139,11 +139,16 @@ internal static void CheckChunk(string localTmpPath, List objects, str } internal static IEnumerable> GetObjectsFromS3(Vendor vendor, int buildingId, string awsAccessKeyId, string awsSecretAccessKey, - string bucket, string cdmFolder, string table, int chunkId, int slicesNum) + string bucket, string cdmFolder, string table, int chunkId, IEnumerable slices, bool skipObjectCountChecking = false) { - for (int i = 0; i < slicesNum; i++) + var orderedSlices = slices.OrderBy(s => s).ToList(); + bool stop = false; + for (int i = 0; i < orderedSlices.Count; i++) { - var prefix = $"{vendor}/{buildingId}/{cdmFolder}/{table}/{table}.{i}.{chunkId}."; + if (stop) + break; + + var prefix = $"{vendor}/{buildingId}/{cdmFolder}/{table}/{table}.{orderedSlices[i]}.{chunkId}."; using var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); @@ -157,11 +162,18 @@ internal static IEnumerable> GetObjectsFromS3(Vendor vendor, int { var responseTask = client.ListObjectsV2Async(request); responseTask.Wait(); - response = responseTask.Result; + response = responseTask.Result; yield return response.S3Objects; request.ContinuationToken = 
response.NextContinuationToken; + + //assume that all objects are not sparse, so no other object is expected with number more that the previous one + if (!skipObjectCountChecking && response.S3Objects.Count == 0) + { + stop = true; + break; + } } while (response.IsTruncated); } @@ -197,7 +209,7 @@ internal static List FindSlicesByPersonIds(string awsAccessKeyId, string { using var transferUtility = new TransferUtility(awsAccessKeyId, awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); using var responseStream = transferUtility.OpenStream(bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); + using var bufferedStream = new BufferedStream(responseStream); using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); using var reader = new StreamReader(gzipStream, Encoding.Default); string line; diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index e48a3e7d..1abfdc70 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -1,10 +1,102 @@ -namespace RunValidation +using Amazon.S3.Model; +using CommandLine.Text; +using CommandLine; +using org.ohdsi.cdm.framework.common.Enums; +using org.ohdsi.cdm.framework.common.Utility; +using System.Configuration; +using System.ComponentModel.Design; + +namespace RunValidation { internal class Program { + internal class Options + { + [Option('v', "vendor", Required = true, HelpText = "Vendor name.")] + public required string Vendor { get; set; } + + [Option('b', "buildingId", Required = true, HelpText = "Building ID.")] + public required int BuildingId { get; set; } + + [Option('e', "etlLibraryPath", Default = "", HelpText = "(Optional) Path to a folder containing an external ETL .dll")] + public string EtlLibraryPath { get; set; } = ""; + + [Option('l', "localTmpPath", Default = "C:\\_tmp", HelpText = "(Optional) Path to local folder to contain intermediary data")] + public string LocalTmpPath { get; set; } = "C:\\_tmp"; 
+ + [Option('c', "chunks", Separator = ',', HelpText = "(Optional) Comma-separated list of chunk IDs to process. All of them, if omitted.")] + public IEnumerable Chunks { get; set; } = new List(); + + [Option('s', "slices", Separator = ',', HelpText = "(Optional) Comma-separated list of slice IDs to process for each chunkId. 100, if omitted.")] + public IEnumerable Slices { get; set; } = new List(); + + [Usage(ApplicationAlias = "RunValidation")] + public static IEnumerable Examples + { + get + { + yield return new Example("Process all chunks", new Options + { Vendor = "VendorName", BuildingId = 123}); + yield return new Example("Process all chunks for an external .dll", new Options + { Vendor = "ExternalVendorName", BuildingId = 123, EtlLibraryPath = "C:\\PathToExternalDllFolder"}); + yield return new Example("Process specified chunks", new Options + { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1, 2, 3 } }); + yield return new Example("Process specified slices for specified chunks", new Options + { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1, 2, 3 }, Slices = new List { 1, 2, 3 } }); + } + } + } + + private static string _awsAccessKeyId => ConfigurationManager.AppSettings["awsAccessKeyId"] ?? throw new NullReferenceException("awsAccessKeyId"); + private static string _awsSecretAccessKey => ConfigurationManager.AppSettings["awsSecretAccessKey"] ?? throw new NullReferenceException("awsSecretAccessKey"); + private static string _bucket => ConfigurationManager.AppSettings["bucket"] ?? throw new NullReferenceException("bucket"); + private static string _cdmFolder => ConfigurationManager.AppSettings["cdmFolder"] ?? 
throw new NullReferenceException("cdmFolder"); + static void Main(string[] args) { - Console.WriteLine("Hello, World!"); + Parser.Default.ParseArguments(args) + .WithParsed(RunWithOptions) + .WithNotParsed(HandleParseError); + + Console.ReadLine(); + } + + static void RunWithOptions(Options opts) + { + var chunks = opts.Chunks ?? Enumerable.Empty(); + var slices = opts.Slices ?? Enumerable.Empty(); + + Console.WriteLine("Options:"); + Console.WriteLine($"Keys: {_awsAccessKeyId} - {_awsSecretAccessKey}"); + Console.WriteLine($"Bucket - folder: {_bucket} - {_cdmFolder}"); + + Console.WriteLine($"Vendor: {opts.Vendor}"); + Console.WriteLine($"Building ID: {opts.BuildingId}"); + Console.WriteLine($"EtlLibraryPath: {opts.EtlLibraryPath}"); + Console.WriteLine($"LocalTmpPath: {opts.LocalTmpPath}"); + Console.WriteLine($"Chunks: {string.Join(", ", chunks)}"); + Console.WriteLine($"Slices: {string.Join(", ", slices)}"); + Console.WriteLine($"Current directory: {Directory.GetCurrentDirectory()}"); + Console.WriteLine(); + + //int[] slicesNum = [24, 40, 48, 96, 192]; + + //var localTmpPath = "C:\\_tmp"; + //var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, localTmpPath); + //validation.Start((Vendor)Enum.Parse(typeof(Vendor), args[0]), int.Parse(args[1]), slicesNum[0], _cdmFolder); + Vendor vendor = EtlLibrary.CreateVendorInstance(opts.EtlLibraryPath, opts.Vendor); + var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, opts.LocalTmpPath, _cdmFolder); + validation.ValidateBuildingId(vendor, opts.BuildingId, chunks, slices); + } + + static void HandleParseError(IEnumerable errs) + { + // Handle errors + Console.WriteLine("Failed to parse command-line arguments."); + foreach (var error in errs) + { + Console.WriteLine(error.ToString()); + } } } } diff --git a/sources/RunValidation/RunValidation.csproj b/sources/RunValidation/RunValidation.csproj index 1adb2faa..fd2baeaa 100644 --- 
a/sources/RunValidation/RunValidation.csproj +++ b/sources/RunValidation/RunValidation.csproj @@ -7,6 +7,10 @@ enable + + + + diff --git a/sources/Tests/RunLocal/Validation.cs b/sources/RunValidation/Validation.cs similarity index 57% rename from sources/Tests/RunLocal/Validation.cs rename to sources/RunValidation/Validation.cs index f7aa3f27..bdbe4465 100644 --- a/sources/Tests/RunLocal/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -4,22 +4,35 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; +using System.Numerics; +using System.Runtime.InteropServices; -namespace RunLocal +namespace RunValidation { - public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string bucket, string tmpFolder) + public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string bucket, string tmpFolder, string cdmFolder) { + #region Fields + private readonly string _awsAccessKeyId = awsAccessKeyId; private readonly string _awsSecretAccessKey = awsSecretAccessKey; private readonly string _bucket = bucket; private readonly string _tmpFolder = tmpFolder; - private string _cdmFolder; + private readonly string _cdmFolder = cdmFolder; - public void Start(Vendor vendor, int buildingId, int slicesNum, string cdmFolder) - { - _cdmFolder = cdmFolder; + #endregion + + #region Methods + /// + /// Method to check the correctness of person ids for specified groups of vendor + buildingId + chunkId + slicesNum + /// + /// + /// + /// If omitted or null, all chunkIds on S3 are checked + /// Slices within a chunk to process + public void ValidateBuildingId(Vendor vendor, int buildingId, IEnumerable chunkIds, IEnumerable slices) + { Console.WriteLine($"{vendor}.{buildingId}"); List wrong = []; HashSet missed = []; @@ -33,24 +46,36 @@ public void Start(Vendor vendor, int buildingId, int slicesNum, string cdmFolder foreach (var chunk in Helper.GetChunksFromS3(_tmpFolder, vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, 
_bucket)) { var chunkId = chunk.Key; + if (chunkIds != null && chunkIds.Any() && !chunkIds.Any(s => s == chunkId)) + { + Console.WriteLine("Skip chunkId " + chunkId); + continue; + } + #region var objects = new List(); from PERSON and METADATA_TMP var objects = new List(); + + var slices2process = (slices == null || !slices.Any()) + ? Enumerable.Range(1, 100).ToList() + : slices.Distinct().OrderBy(s => s).ToList() + ; + foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, - _cdmFolder, "PERSON", chunkId, slicesNum)) + _cdmFolder, "PERSON", chunkId, slices2process)) { objects.AddRange(o); } + //this is to exclude slices, which definetely won't be in metadata + //is such a case possible when there's no file in PERSON, but a file in METADATA for a single slice? + var slices2processInMetadata = objects.Select(s => int.Parse(s.Key.Split(new[] { '/' }).Last().Split(new[] { '.' })[1])).ToList(); + foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, - _cdmFolder, "METADATA_TMP", chunkId, slicesNum)) + _cdmFolder, "METADATA_TMP", chunkId, slices2process, true)) { objects.AddRange(o); } - - if (objects.Count == 0) - { - wrong.Add($"chunkId={chunkId} - MISSED"); - } + #endregion Helper.CheckChunk(_tmpFolder, objects, _awsAccessKeyId, _awsSecretAccessKey, _bucket, chunk); @@ -91,12 +116,26 @@ public void Start(Vendor vendor, int buildingId, int slicesNum, string cdmFolder Console.WriteLine(fileName); } } + + Console.WriteLine(); + timer.Stop(); + Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); + timer.Restart(); } + } - Console.WriteLine(); - timer.Stop(); - Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); - timer.Restart(); + int ParseSliceIdFromKey(string key) + { + var parts = key.Split(new[] { '.' 
}, StringSplitOptions.RemoveEmptyEntries); + // Assuming sliceId is at a specific position in the parts array + if (parts.Length == 6 && int.TryParse(parts[1], out int sliceId)) + { + return sliceId; + } + throw new FormatException($"Invalid S3 object key format: {key}"); } + + #endregion + } } \ No newline at end of file diff --git a/sources/RunValidation/Validation1.cs b/sources/RunValidation/Validation1.cs deleted file mode 100644 index 83e8a307..00000000 --- a/sources/RunValidation/Validation1.cs +++ /dev/null @@ -1,103 +0,0 @@ -using Amazon.S3.Model; -using org.ohdsi.cdm.framework.common.Enums; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Numerics; - -namespace RunValidation -{ - - public class Validation1(string awsAccessKeyId, string awsSecretAccessKey, string bucket, string tmpFolder) - { - private readonly string _awsAccessKeyId = awsAccessKeyId; - private readonly string _awsSecretAccessKey = awsSecretAccessKey; - private readonly string _bucket = bucket; - private readonly string _tmpFolder = tmpFolder; - private string _cdmFolder; - - public void Start(Vendor vendor, int buildingId, int slicesNum, string cdmFolder) - { - _cdmFolder = cdmFolder; - - Console.WriteLine($"{vendor}.{buildingId}"); - List wrong = []; - HashSet missed = []; - var cIds = new HashSet(); - var pIds = new HashSet(); - var f = new HashSet(); - var s = new HashSet(); - var timer = new Stopwatch(); - timer.Start(); - - foreach (var chunk in Helper.GetChunksFromS3(_tmpFolder, vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket)) - { - var chunkId = chunk.Key; - - var objects = new List(); - foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, - _cdmFolder, "PERSON", chunkId, slicesNum)) - { - objects.AddRange(o); - } - - foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, - _cdmFolder, 
"METADATA_TMP", chunkId, slicesNum)) - { - objects.AddRange(o); - } - - if (objects.Count == 0) - { - wrong.Add($"chunkId={chunkId} - MISSED"); - } - - Helper.CheckChunk(_tmpFolder, objects, _awsAccessKeyId, _awsSecretAccessKey, _bucket, chunk); - - int missedCnt = 0; - var missedPersonIds = new Dictionary(); - - foreach (var c in chunk.Value) - { - if (c.Value.Count != 1) - { - missedCnt++; - - if (missedCnt == 1 || missedCnt % 500 == 0) - missedPersonIds.Add(c.Key, false); - } - - if (c.Value.Count != 1) - { - wrong.Add($"chunkId={chunkId};person_id={c.Key};filese={string.Join(',', [.. c.Value])}"); - cIds.Add(chunkId); - pIds.Add(c.Key); - - foreach (var v in c.Value) - { - f.Add(v); - s.Add($@"done.Add(Process(vendor, buildingId, {chunkId}, ""{Int32.Parse(v.Split('/')[4].Split('.')[1]):0000}"", true));"); - } - } - } - - if (missedPersonIds.Count > 0) - { - foreach (var r in Helper.FindSlicesByPersonIds(_awsAccessKeyId, _awsSecretAccessKey, _bucket, vendor, buildingId, chunkId, vendor.PersonTableName, missedPersonIds, vendor.PersonIdIndex)) - { - missed.Add(r); - var fileName = r.Replace(@"\", "_").Replace(@"/", "_"); - File.Create($@"{_tmpFolder}\{fileName}.txt").Dispose(); - Console.WriteLine(fileName); - } - } - } - - Console.WriteLine(); - timer.Stop(); - Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); - timer.Restart(); - } - } -} \ No newline at end of file diff --git a/sources/RunValidation/Validation2.cs b/sources/RunValidation/Validation2.cs index 2cd32cb5..c9052456 100644 --- a/sources/RunValidation/Validation2.cs +++ b/sources/RunValidation/Validation2.cs @@ -12,6 +12,7 @@ namespace RunValidation { + [Obsolete] //it's here for a reference; //todo delete //copy from Presentation public class Validation2 { private string _cdmFolder = "cdmCSV"; diff --git a/sources/Tests/RunLocal/Helper.cs b/sources/Tests/RunLocal/Helper.cs deleted file mode 100644 index d2e6f720..00000000 --- a/sources/Tests/RunLocal/Helper.cs +++ /dev/null @@ -1,308 +0,0 @@ 
-using Amazon.S3; -using Amazon.S3.Model; -using Amazon.S3.Transfer; -using CsvHelper.Configuration; -using CsvHelper; -using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Diagnostics; -using System.Globalization; -using System.IO.Compression; -using System.IO; -using System.Linq; -using System.Text; -using System.Threading; -using System.Threading.Tasks; -using org.ohdsi.cdm.framework.common.Enums; - -namespace RunLocal -{ - internal class Helper - { - internal static void CheckChunk(string localTmpPath, List objects, string awsAccessKeyId, string awsSecretAccessKey, string bucket, - KeyValuePair>> chunk) - { - - var missed = 0; - var dups = 0; - - var attempt = 0; - var complete = false; - - var config = new AmazonS3Config - { - Timeout = TimeSpan.FromMinutes(60), - RegionEndpoint = Amazon.RegionEndpoint.USEast1, - BufferSize = 1512 * 1024, - MaxErrorRetry = 120 - }; - - while (!complete) - { - try - { - attempt++; - - var timer = new Stopwatch(); - timer.Start(); - - var cnt = 0; - var attempt1 = attempt; - - Parallel.ForEach(objects, new ParallelOptions() { MaxDegreeOfParallelism = 10 }, o => - { - var loadAttempt = 0; - var loaded = false; - while (!loaded) - { - try - { - loadAttempt++; - using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, config)) - using (var transferUtility = new TransferUtility(client)) - { - transferUtility.Download($@"{localTmpPath}\{o.Key}", bucket, o.Key); - } - loaded = true; - } - catch (Exception) - { - if (loadAttempt <= 11) - { - Console.WriteLine(o.Key + " | " + loadAttempt); - } - else - { - throw; - } - } - } - - - using (var responseStream = File.Open($@"{localTmpPath}\{o.Key}", FileMode.Open)) - using (var bufferedStream = new BufferedStream(responseStream)) - using (var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress)) - using (var reader = new StreamReader(gzipStream, Encoding.Default)) - { - using var csv = new CsvReader(reader, 
new CsvConfiguration(CultureInfo.InvariantCulture) - { - HasHeaderRecord = false, - Delimiter = ",", - Encoding = Encoding.UTF8 - }); - while (csv.Read()) - { - var personId = (long)csv.GetField(typeof(long), 0); - - chunk.Value[personId].Add(o.Key); - } - - Interlocked.Increment(ref cnt); - - Console.Write( - $"\rchunkId={chunk.Key} | {cnt} from {objects.Count} | attempt={attempt1}"); - } - - File.Delete($@"{localTmpPath}\{o.Key}"); - - }); - - - foreach (var ci in chunk.Value) - { - if (ci.Value.Count == 0) - { - missed++; - } - else if (ci.Value.Count > 1) - { - dups++; - } - } - - timer.Stop(); - Console.WriteLine($" | DONE | missed={missed}; dups={dups} | total={timer.ElapsedMilliseconds}ms"); - - complete = true; - } - catch (Exception) - { - Console.Write(" | Exception"); - if (attempt > 3) - { - throw; - } - } - } - } - - internal static IEnumerable> GetObjectsFromS3(Vendor vendor, int buildingId, string awsAccessKeyId, string awsSecretAccessKey, - string bucket, string cdmFolder, string table, int chunkId, int slicesNum) - { - for (int i = 0; i < slicesNum; i++) - { - var prefix = $"{vendor}/{buildingId}/{cdmFolder}/{table}/{table}.{i}.{chunkId}."; - - using var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, - Amazon.RegionEndpoint.USEast1); - var request = new ListObjectsV2Request - { - BucketName = bucket, - Prefix = prefix - }; - ListObjectsV2Response response; - do - { - var responseTask = client.ListObjectsV2Async(request); - responseTask.Wait(); - response = responseTask.Result; - - yield return response.S3Objects; - - request.ContinuationToken = response.NextContinuationToken; - } while (response.IsTruncated); - } - - } - - internal static List FindSlicesByPersonIds(string awsAccessKeyId, string awsSecretAccessKey, string bucket, Vendor vendor, int buildingId, int chunkId, string table, Dictionary personIds, int personIndex) - { - var prefix = $"{vendor}/{buildingId}/raw/{chunkId}/{table}/{table}"; - - var input = new 
ConcurrentDictionary(); - - foreach (var pId in personIds.Keys) - { - input.TryAdd(pId, false); - } - - var result = new ConcurrentDictionary(); - using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = bucket, - Prefix = prefix - }; - - var r = client.ListObjectsV2Async(request); - r.Wait(); - var response = r.Result; - var rows = new List(); - - Parallel.ForEach(response.S3Objects, o => - { - using var transferUtility = new TransferUtility(awsAccessKeyId, awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); - using var responseStream = transferUtility.OpenStream(bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); - using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); - using var reader = new StreamReader(gzipStream, Encoding.Default); - string line; - while ((line = reader.ReadLine()) != null) - { - if (input.IsEmpty) - break; - - if (!string.IsNullOrEmpty(line)) - { - long personId = long.Parse(line.Split('\t')[personIndex]); - if (personIds.ContainsKey(personId)) - { - result.TryAdd(o.Key, false); - input.TryRemove(personId, out var res); - break; - } - } - } - }); - } - - return [.. 
result.Keys]; - } - - internal static IEnumerable>>> GetChunksFromS3(string localTmpPath, Vendor vendor, int buildingId, - string awsAccessKeyId, string awsSecretAccessKey, - string bucket) - { - var currentChunkId = 0; - var result = new KeyValuePair>>(0, []); - var prefix = $"{vendor}/{buildingId}/_chunks"; - using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = bucket, - Prefix = prefix - }; - - var response = client.ListObjectsV2Async(request); - response.Wait(); - - foreach (var o in response.Result.S3Objects.OrderBy(o => o.LastModified)) - { - var loadAttempt = 0; - var loaded = false; - while (!loaded) - { - try - { - loadAttempt++; - using (var transferUtility = new TransferUtility(client)) - { - transferUtility.Download($@"{localTmpPath}\{o.Key}", bucket, o.Key); - } - loaded = true; - } - catch (Exception) - { - if (loadAttempt <= 11) - { - Console.WriteLine(o.Key + " | " + loadAttempt); - } - else - { - throw; - } - } - } - - using (var responseStream = File.Open($@"{localTmpPath}\{o.Key}", FileMode.Open)) - using (var bufferedStream = new BufferedStream(responseStream)) - using (var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress)) - using (var reader = new StreamReader(gzipStream, Encoding.Default)) - { - string line; - while ((line = reader.ReadLine()) != null) - { - if (!string.IsNullOrEmpty(line)) - { - var chunkId = int.Parse(line.Split('\t')[0]); - - if (currentChunkId != chunkId) - { - if (result.Value.Count > 0) - yield return result; - - result = new KeyValuePair>>(chunkId, - []); - currentChunkId = chunkId; - } - - var personId = long.Parse(line.Split('\t')[1]); - result.Value.Add(personId, []); - } - } - } - - File.Delete($@"{localTmpPath}\{o.Key}"); - } - } - - if (result.Value.Count > 0) - yield return result; - } - } -} diff --git a/sources/Tests/RunLocal/Program.cs 
b/sources/Tests/RunLocal/Program.cs index 39edec0a..3efb76c1 100644 --- a/sources/Tests/RunLocal/Program.cs +++ b/sources/Tests/RunLocal/Program.cs @@ -29,12 +29,6 @@ static void Main(string[] args) Process(EtlLibrary.CreateVendorInstance(args[5], args[0]), int.Parse(args[1]), int.Parse(args[2]), args[3], bool.Parse(args[4]), args[5]); - //int[] slicesNum = [24, 40, 48, 96, 192]; - - //var localTmpPath = "C:\\_tmp"; - //var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, localTmpPath); - //validation.Start((Vendor)Enum.Parse(typeof(Vendor), args[0]), int.Parse(args[1]), slicesNum[0], _cdmFolder); - Console.WriteLine("DONE"); Console.ReadLine(); } From f01e936c599543dac5a7089a3a7f05d1df379500 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Fri, 18 Oct 2024 20:27:05 +0200 Subject: [PATCH 04/37] Standartize ParseSliceIdFromKey --- sources/RunValidation/Validation.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index bdbe4465..88b7f031 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -66,9 +66,9 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, IEnumerable c objects.AddRange(o); } - //this is to exclude slices, which definetely won't be in metadata - //is such a case possible when there's no file in PERSON, but a file in METADATA for a single slice? - var slices2processInMetadata = objects.Select(s => int.Parse(s.Key.Split(new[] { '/' }).Last().Split(new[] { '.' })[1])).ToList(); + // this is to exclude slices, which definetely won't be in metadata + // is such a case possible when there's no file in PERSON, but a file in METADATA for a single slice? 
+ var slices2processInMetadata = objects.Select(s => parseSliceIdFromKey(s.Key)).ToList(); foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, _cdmFolder, "METADATA_TMP", chunkId, slices2process, true)) @@ -101,7 +101,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, IEnumerable c foreach (var v in c.Value) { f.Add(v); - s.Add($@"done.Add(Process(vendor, buildingId, {chunkId}, ""{Int32.Parse(v.Split('/')[4].Split('.')[1]):0000}"", true));"); + s.Add($@"done.Add(Process(vendor, buildingId, {chunkId}, ""{parseSliceIdFromKey(v):0000}"", true));"); } } } @@ -124,10 +124,10 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, IEnumerable c } } - int ParseSliceIdFromKey(string key) + int parseSliceIdFromKey(string key) { var parts = key.Split(new[] { '.' }, StringSplitOptions.RemoveEmptyEntries); - // Assuming sliceId is at a specific position in the parts array + // assuming sliceId is at a specific position in the parts array if (parts.Length == 6 && int.TryParse(parts[1], out int sliceId)) { return sliceId; From e682f0d8e9f175dff900e5dceb305ae4932b924b Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Mon, 21 Oct 2024 17:16:29 +0200 Subject: [PATCH 05/37] Validation from RunLocal done --- .../Common/Utility/EtlLibrary.cs | 1 + sources/RunValidation/Helper.cs | 30 +++++++++++-------- sources/RunValidation/RunValidation.csproj | 3 +- sources/RunValidation/Validation.cs | 6 +--- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index 92ce260e..be26d760 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -128,6 +128,7 @@ public static Vendor CreateVendorInstance(string etlLibraryPath, string name) { 
Console.WriteLine("CreateVendorInstance | assembly: " + assembly.GetName().Name); Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); + Console.WriteLine(); return Activator.CreateInstance(vendorType) as Vendor; } diff --git a/sources/RunValidation/Helper.cs b/sources/RunValidation/Helper.cs index 8a01f64c..51670047 100644 --- a/sources/RunValidation/Helper.cs +++ b/sources/RunValidation/Helper.cs @@ -15,6 +15,9 @@ using System.Threading; using System.Threading.Tasks; using org.ohdsi.cdm.framework.common.Enums; +using org.ohdsi.cdm.presentation.lambdabuilder; +using ZstdSharp; +using Amazon.Runtime.Internal.Util; namespace RunValidation { @@ -209,25 +212,26 @@ internal static List FindSlicesByPersonIds(string awsAccessKeyId, string { using var transferUtility = new TransferUtility(awsAccessKeyId, awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); using var responseStream = transferUtility.OpenStream(bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); - using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); - using var reader = new StreamReader(gzipStream, Encoding.Default); - string line; - while ((line = reader.ReadLine()) != null) + using var bufferedStream = new BufferedStream(responseStream); + using Stream stream = o.Key.EndsWith(".gz") + ? new GZipStream(bufferedStream, CompressionMode.Decompress) + : new DecompressionStream(bufferedStream) //.zst + ; + using var reader = new StreamReader(stream, Encoding.Default); + string? 
line = reader.ReadLine(); + while (!string.IsNullOrEmpty(line)) { if (input.IsEmpty) break; - if (!string.IsNullOrEmpty(line)) + long personId = long.Parse(line.Split('\t')[personIndex]); + if (personIds.ContainsKey(personId)) { - long personId = long.Parse(line.Split('\t')[personIndex]); - if (personIds.ContainsKey(personId)) - { - result.TryAdd(o.Key, false); - input.TryRemove(personId, out var res); - break; - } + result.TryAdd(o.Key, false); + input.TryRemove(personId, out var res); + break; } + line = reader.ReadLine(); } }); } diff --git a/sources/RunValidation/RunValidation.csproj b/sources/RunValidation/RunValidation.csproj index fd2baeaa..69bd35ab 100644 --- a/sources/RunValidation/RunValidation.csproj +++ b/sources/RunValidation/RunValidation.csproj @@ -2,7 +2,7 @@ Exe - net8.0 + net8.0-windows enable enable @@ -13,6 +13,7 @@ + diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 88b7f031..d283f031 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -25,7 +25,7 @@ public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string #region Methods /// - /// Method to check the correctness of person ids for specified groups of vendor + buildingId + chunkId + slicesNum + /// Method to check the correctness of person ids for specified groups of vendor + buildingId + chunkId + slices /// /// /// @@ -66,10 +66,6 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, IEnumerable c objects.AddRange(o); } - // this is to exclude slices, which definetely won't be in metadata - // is such a case possible when there's no file in PERSON, but a file in METADATA for a single slice? 
- var slices2processInMetadata = objects.Select(s => parseSliceIdFromKey(s.Key)).ToList(); - foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, _cdmFolder, "METADATA_TMP", chunkId, slices2process, true)) { From 58a3ea32e852a87df8041795956d80b9eae82266 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Mon, 21 Oct 2024 19:10:01 +0200 Subject: [PATCH 06/37] Replace old Validation from RunLocal with seemingly newer from Lambdabuilder --- sources/RunValidation/Program.cs | 2 +- sources/RunValidation/Validation.cs | 457 ++++++++++++++++++++++++---- 2 files changed, 400 insertions(+), 59 deletions(-) diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 1abfdc70..98ee0752 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -50,7 +50,7 @@ public static IEnumerable Examples private static string _awsAccessKeyId => ConfigurationManager.AppSettings["awsAccessKeyId"] ?? throw new NullReferenceException("awsAccessKeyId"); private static string _awsSecretAccessKey => ConfigurationManager.AppSettings["awsSecretAccessKey"] ?? throw new NullReferenceException("awsSecretAccessKey"); private static string _bucket => ConfigurationManager.AppSettings["bucket"] ?? throw new NullReferenceException("bucket"); - private static string _cdmFolder => ConfigurationManager.AppSettings["cdmFolder"] ?? throw new NullReferenceException("cdmFolder"); + private static string _cdmFolder => ConfigurationManager.AppSettings["cdmFolder"] ?? 
"cdmCSV"; static void Main(string[] args) { diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index d283f031..56171294 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -1,11 +1,20 @@ -using Amazon.S3.Model; +using Amazon.S3; +using Amazon.S3.Model; +using Amazon.S3.Transfer; +using CsvHelper.Configuration; +using CsvHelper; using org.ohdsi.cdm.framework.common.Enums; +using org.ohdsi.cdm.framework.common.Helpers; using System; using System.Collections.Generic; using System.Diagnostics; +using System.Globalization; using System.IO; +using System.IO.Compression; using System.Numerics; using System.Runtime.InteropServices; +using System.Text; +using ZstdSharp; namespace RunValidation { @@ -19,116 +28,448 @@ public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string private readonly string _bucket = bucket; private readonly string _tmpFolder = tmpFolder; private readonly string _cdmFolder = cdmFolder; + private readonly LambdaUtility _lambdaUtility = + new LambdaUtility(awsAccessKeyId, awsSecretAccessKey, awsAccessKeyId, awsSecretAccessKey, bucket, bucket, bucket, cdmFolder); #endregion #region Methods - /// - /// Method to check the correctness of person ids for specified groups of vendor + buildingId + chunkId + slices - /// - /// - /// - /// If omitted or null, all chunkIds on S3 are checked - /// Slices within a chunk to process - public void ValidateBuildingId(Vendor vendor, int buildingId, IEnumerable chunkIds, IEnumerable slices) + public void ValidateBuildingId(Vendor vendor, int buildingIdToCheck, IEnumerable chunkIdsToCheck, IEnumerable slicesToCheck) { - Console.WriteLine($"{vendor}.{buildingId}"); - List wrong = []; - HashSet missed = []; - var cIds = new HashSet(); - var pIds = new HashSet(); - var f = new HashSet(); - var s = new HashSet(); + var wrong = new List(); var timer = new Stopwatch(); timer.Start(); - foreach (var chunk in 
Helper.GetChunksFromS3(_tmpFolder, vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket)) + foreach (var chunk in GetChunks(vendor, buildingIdToCheck)) { var chunkId = chunk.Key; - if (chunkIds != null && chunkIds.Any() && !chunkIds.Any(s => s == chunkId)) + if (chunkIdsToCheck != null && chunkIdsToCheck.Any() && !chunkIdsToCheck.Any(s => s == chunkId)) { Console.WriteLine("Skip chunkId " + chunkId); continue; } - #region var objects = new List(); from PERSON and METADATA_TMP - var objects = new List(); + var actualSlices = GetActualSlices(vendor.Name, buildingIdToCheck); - var slices2process = (slices == null || !slices.Any()) - ? Enumerable.Range(1, 100).ToList() - : slices.Distinct().OrderBy(s => s).ToList() + var slices2process = (slicesToCheck == null || !slicesToCheck.Any()) + ? actualSlices + .OrderBy(s => s) + .ToList() + : slicesToCheck + .Distinct() + .Where(s => actualSlices.Any(a => a == s)) + .OrderBy(s => s) + .ToList() ; - foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, - _cdmFolder, "PERSON", chunkId, slices2process)) + var objects = new List(); + foreach (var o in GetObjects(vendor, buildingIdToCheck, "PERSON", chunkId, slices2process)) { objects.AddRange(o); } - foreach (var o in Helper.GetObjectsFromS3(vendor, buildingId, _awsAccessKeyId, _awsSecretAccessKey, _bucket, - _cdmFolder, "METADATA_TMP", chunkId, slices2process, true)) + foreach (var o in GetObjects(vendor, buildingIdToCheck, "METADATA_TMP", chunkId, slices2process)) { objects.AddRange(o); } - #endregion - Helper.CheckChunk(_tmpFolder, objects, _awsAccessKeyId, _awsSecretAccessKey, _bucket, chunk); + if (objects.Count == 0) + { + wrong.Add($"chunkId={chunkId} - MISSED"); + } - int missedCnt = 0; - var missedPersonIds = new Dictionary(); + ProcessChunk(vendor, buildingIdToCheck, objects, chunk, slices2process, true); foreach (var c in chunk.Value) { if (c.Value.Count != 1) - { - missedCnt++; + 
wrong.Add($"chunkId={chunkId};person_id={c.Key};files={string.Join(',', [.. c.Value])}"); + } + } + + Console.WriteLine(); + timer.Stop(); + Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); + timer.Restart(); + } - if (missedCnt == 1 || missedCnt % 500 == 0) - missedPersonIds.Add(c.Key, false); + private HashSet GetActualSlices(string vendorName, int buildingId) + { + var slices = new HashSet(); + var prefix = $"{vendorName}/{buildingId}/{_cdmFolder}/PERSON/PERSON."; + Console.WriteLine("Calculating slices " + _bucket + "|" + prefix); + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = prefix + }; + ListObjectsV2Response response; + + do + { + var responseTask = client.ListObjectsV2Async(request); + responseTask.Wait(); + response = responseTask.Result; + + foreach (var o in response.S3Objects) + { + slices.Add(int.Parse(o.Key.Split('.')[1])); } - if (c.Value.Count != 1) + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated); + } + + Console.WriteLine("slices.Count=" + slices.Count); + + return slices; + } + + private IEnumerable>>> GetChunks(Vendor vendor, int buildingId) + { + var currentChunkId = 0; + var result = new KeyValuePair>>(0, []); + var prefix = $"{vendor}/{buildingId}/_chunks"; + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = prefix + }; + + var response = client.ListObjectsV2Async(request); + response.Wait(); + + foreach (var o in response.Result.S3Objects.OrderBy(o => o.LastModified)) + { + using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(_bucket, o.Key); + using var bufferedStream = new 
BufferedStream(responseStream); + using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); + using var reader = new StreamReader(gzipStream, Encoding.Default); + string? line = reader.ReadLine(); + while (!string.IsNullOrEmpty(line)) { - wrong.Add($"chunkId={chunkId};person_id={c.Key};filese={string.Join(',', [.. c.Value])}"); - cIds.Add(chunkId); - pIds.Add(c.Key); + var chunkId = int.Parse(line.Split('\t')[0]); - foreach (var v in c.Value) + if (currentChunkId != chunkId) { - f.Add(v); - s.Add($@"done.Add(Process(vendor, buildingId, {chunkId}, ""{parseSliceIdFromKey(v):0000}"", true));"); + if (result.Value.Count > 0) + yield return result; + + result = new KeyValuePair>>(chunkId, + []); + currentChunkId = chunkId; } + + var personId = long.Parse(line.Split('\t')[1]); + result.Value.Add(personId, []); + + line = reader.ReadLine(); } } + } + + if (result.Value.Count > 0) + yield return result; + } + + private IEnumerable> GetObjects(Vendor vendor, int buildingId, string table, int chunkId, List slices) + { + var orderedSlices = slices.Distinct().OrderBy(s => s).ToList(); + for (int i = 0; i < orderedSlices.Count; i++) + { + var prefix = $"{vendor}/{buildingId}/{_cdmFolder}/{table}/{table}.{orderedSlices[i]}.{chunkId}."; + using var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = prefix + }; + ListObjectsV2Response response; + do + { + response = client.ListObjectsV2Async(request).GetAwaiter().GetResult(); + yield return response.S3Objects; + request.ContinuationToken = response.NextContinuationToken; + } + while (response.IsTruncated); + } + } + + private Dictionary ProcessChunk(Vendor vendor, int buildingId, List objects, KeyValuePair>> chunk, List slices, bool onlyCheck = true) + { + var attempt = 0; + var complete = false; + + while (!complete) + { + try + { + attempt++; + var missed = 0; + var dups = 0; + + 
foreach (var ci in chunk.Value) + { + ci.Value.Clear(); + } + + var timer = new Stopwatch(); + timer.Start(); + + var cnt = 0; + var attempt1 = attempt; + + Parallel.ForEach(objects, o => + { + using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(_bucket, o.Key); + using var bufferedStream = new BufferedStream(responseStream); + using Stream compressedStream = o.Key.EndsWith(".gz") + ? new GZipStream(bufferedStream, CompressionMode.Decompress) + : new DecompressionStream(bufferedStream) //.zst + ; + using var reader = new StreamReader(compressedStream, Encoding.Default); + using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) + { + HasHeaderRecord = false, + Delimiter = ",", + Encoding = Encoding.UTF8 + }); + while (csv.Read()) + { + var personId = (long)csv.GetField(typeof(long), 0); + + chunk.Value[personId].Add(o.Key); + } + Interlocked.Increment(ref cnt); + }); + + int wrngCnt = 0; + var wrongPersonIds = new Dictionary(); + foreach (var ci in chunk.Value) + { + if (ci.Value.Count == 0) + { + missed++; + } + else if (ci.Value.Count > 1) + { + dups++; + } + + if (ci.Value.Count != 1) + { + wrngCnt++; - if (missedPersonIds.Count > 0) + if (wrngCnt == 1 || wrngCnt % 500 == 0) + wrongPersonIds.Add(ci.Key, false); + } + } + + timer.Stop(); + + if (missed > 0 || dups > 0) + { + Console.WriteLine($"XXX ChunkId={chunk.Key} | missed={missed}; dups={dups} | {wrongPersonIds.Keys.Count}"); + if (!onlyCheck) + { + Cleanup(vendor, buildingId, chunk.Key, slices); + var tasks = _lambdaUtility.TriggerBuildFunction(vendor, buildingId, chunk.Key, false); + Task.WaitAll([.. 
tasks]); + + var checkCreation = Task.Run(() => _lambdaUtility.AllChunksWereDone(vendor, buildingId, _lambdaUtility.BuildMessageBucket)); + + checkCreation.Wait(); + + foreach (var personId in chunk.Value.Keys) + { + chunk.Value[personId].Clear(); + } + + throw new Exception("restart"); + } + } + + complete = true; + return wrongPersonIds; + } + catch (Exception ex) { - foreach (var r in Helper.FindSlicesByPersonIds(_awsAccessKeyId, _awsSecretAccessKey, _bucket, vendor, buildingId, chunkId, vendor.PersonTableName, missedPersonIds, vendor.PersonIdIndex)) + Console.Write(ex.Message + " | [ProcessChunk] Exception | new attempt | attempt=" + attempt); + if (attempt > 3) { - missed.Add(r); - var fileName = r.Replace(@"\", "_").Replace(@"/", "_"); - File.Create($@"{_tmpFolder}\{fileName}.txt").Dispose(); - Console.WriteLine(fileName); + throw; } } + } + return null; + } + + private void Cleanup(Vendor vendor, int buildingId, int chunkId, List slices) + { + var tables = new[] + { + "PERSON", + "OBSERVATION_PERIOD", + "PAYER_PLAN_PERIOD", + "DEATH", + "DRUG_EXPOSURE", + "OBSERVATION", + "VISIT_OCCURRENCE", + "VISIT_DETAIL", + "PROCEDURE_OCCURRENCE", + "DRUG_ERA", + "CONDITION_ERA", + "DEVICE_EXPOSURE", + "MEASUREMENT", + "COHORT", + "CONDITION_OCCURRENCE", + "COST", + "NOTE", + "METADATA_TMP", + "FACT_RELATIONSHIP" + }; + + Console.WriteLine("Cleaning chunkId=" + chunkId); + + foreach (var table in tables) + { + Console.WriteLine("Cleaning table=" + table); + + for (var i = 0; i < slices.Count; i++) + { + Clean(vendor, buildingId, chunkId, table, slices[i]); + } + } + + Console.WriteLine($"chunkId={chunkId} was cleaned"); + } + + private IEnumerable GetLines(Stream stream, string filePath) + { + using var bufferedStream = new BufferedStream(stream); + using Stream compressedStream = filePath.EndsWith(".gz") + ? 
new GZipStream(bufferedStream, CompressionMode.Decompress) + : new DecompressionStream(bufferedStream) //.zst + ; + using var reader = new StreamReader(compressedStream, Encoding.Default); + string? line; + while ((line = reader.ReadLine()) != null) + { + if (!string.IsNullOrEmpty(line)) + { + yield return line; + } + } + } - Console.WriteLine(); - timer.Stop(); - Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); - timer.Restart(); + private List FindSlice(Vendor vendor, int buildingId, int chunkId, string table, Dictionary personIds, int personIndex) + { + var prefix = $"{vendor}/{buildingId}/raw/{chunkId}/{table}/{table}"; + + var result = new HashSet(); + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = prefix + }; + + var r = client.ListObjectsV2Async(request); + r.Wait(); + var response = r.Result; + var rows = new List(); + foreach (var o in response.S3Objects) + { + using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(_bucket, o.Key); + { + foreach (var line in GetLines(responseStream, o.Key)) + { + long personId = long.Parse(line.Split('\t')[personIndex]); + if (personIds.ContainsKey(personId)) + { + result.Add(o.Key); + break; + } + } + } + } } + + return [.. result]; } - int parseSliceIdFromKey(string key) + private void Clean(Vendor vendor, int buildingId, int chunkId, string table, int slice) { - var parts = key.Split(new[] { '.' 
}, StringSplitOptions.RemoveEmptyEntries); - // assuming sliceId is at a specific position in the parts array - if (parts.Length == 6 && int.TryParse(parts[1], out int sliceId)) + var attempt = 0; + var complete = false; + + while (!complete) { - return sliceId; + try + { + attempt++; + + var perfix = $"{vendor}/{buildingId}/{_cdmFolder}/{table}/{table}.{slice}.{chunkId}."; + + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = perfix + }; + ListObjectsV2Response response; + do + { + using var getListObjects = client.ListObjectsV2Async(request); + getListObjects.Wait(); + response = getListObjects.Result; + + var multiObjectDeleteRequest = new DeleteObjectsRequest + { + BucketName = _bucket + }; + + foreach (var o in response.S3Objects) + { + multiObjectDeleteRequest.AddKey(o.Key, null); + } + + if (response.S3Objects.Count > 0) + { + using var deleteObjects = client.DeleteObjectsAsync(multiObjectDeleteRequest); + deleteObjects.Wait(); + + //Console.WriteLine(response.S3Objects.Count + " files deleted"); + } + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated == true); + } + + complete = true; + } + catch (Exception ex) + { + Console.Write(" | [Clean] Exception | new attempt | " + attempt); + Console.WriteLine(ex.Message); + if (attempt > 3) + { + throw; + } + } } - throw new FormatException($"Invalid S3 object key format: {key}"); } #endregion From e99a75df8b7566f5e30d49be15842d1f60441f63 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Mon, 21 Oct 2024 19:12:28 +0200 Subject: [PATCH 07/37] Remove unreferenced methods --- sources/RunValidation/Program.cs | 1 + sources/RunValidation/Validation.cs | 57 ----------------------------- 2 files changed, 1 insertion(+), 57 deletions(-) diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 
98ee0752..53a04635 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -54,6 +54,7 @@ public static IEnumerable Examples static void Main(string[] args) { + //for some reason it returns more errors with partial scanning than with full scanning Parser.Default.ParseArguments(args) .WithParsed(RunWithOptions) .WithNotParsed(HandleParseError); diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 56171294..e1cf0c7d 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -100,7 +100,6 @@ private HashSet GetActualSlices(string vendorName, int buildingId) var prefix = $"{vendorName}/{buildingId}/{_cdmFolder}/PERSON/PERSON."; Console.WriteLine("Calculating slices " + _bucket + "|" + prefix); using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) - { var request = new ListObjectsV2Request { @@ -353,62 +352,6 @@ private void Cleanup(Vendor vendor, int buildingId, int chunkId, List slice Console.WriteLine($"chunkId={chunkId} was cleaned"); } - private IEnumerable GetLines(Stream stream, string filePath) - { - using var bufferedStream = new BufferedStream(stream); - using Stream compressedStream = filePath.EndsWith(".gz") - ? new GZipStream(bufferedStream, CompressionMode.Decompress) - : new DecompressionStream(bufferedStream) //.zst - ; - using var reader = new StreamReader(compressedStream, Encoding.Default); - string? 
line; - while ((line = reader.ReadLine()) != null) - { - if (!string.IsNullOrEmpty(line)) - { - yield return line; - } - } - } - - private List FindSlice(Vendor vendor, int buildingId, int chunkId, string table, Dictionary personIds, int personIndex) - { - var prefix = $"{vendor}/{buildingId}/raw/{chunkId}/{table}/{table}"; - - var result = new HashSet(); - using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = _bucket, - Prefix = prefix - }; - - var r = client.ListObjectsV2Async(request); - r.Wait(); - var response = r.Result; - var rows = new List(); - foreach (var o in response.S3Objects) - { - using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); - using var responseStream = transferUtility.OpenStream(_bucket, o.Key); - { - foreach (var line in GetLines(responseStream, o.Key)) - { - long personId = long.Parse(line.Split('\t')[personIndex]); - if (personIds.ContainsKey(personId)) - { - result.Add(o.Key); - break; - } - } - } - } - } - - return [.. result]; - } - private void Clean(Vendor vendor, int buildingId, int chunkId, string table, int slice) { var attempt = 0; From 39c9e5f1b3f832f6e3b8f559bb12ce11f3d11283 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 23 Oct 2024 17:09:10 +0200 Subject: [PATCH 08/37] Validation works. Change params to accept specific chunk:slice pairs, rather that their multiplication. 
Remove some obsolete and tesing code --- .../Helper.cs | 85 ---- sources/RunValidation/Helper.cs | 326 ------------- sources/RunValidation/Program.cs | 44 +- sources/RunValidation/Validation.cs | 374 ++++++--------- sources/RunValidation/Validation2.cs | 449 ------------------ 5 files changed, 174 insertions(+), 1104 deletions(-) delete mode 100644 sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Helper.cs delete mode 100644 sources/RunValidation/Helper.cs delete mode 100644 sources/RunValidation/Validation2.cs diff --git a/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Helper.cs b/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Helper.cs deleted file mode 100644 index 382db171..00000000 --- a/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Helper.cs +++ /dev/null @@ -1,85 +0,0 @@ -using Amazon.S3; -using Amazon.S3.Model; -using System; -using System.Collections.Generic; -using System.IO; -using System.Text; - -namespace org.ohdsi.cdm.presentation.lambdabuilder -{ - class Helper - { - public static IEnumerable GetFiles(string prefix) - { - prefix = prefix.Replace("\\", "/"); - - var request = new ListObjectsV2Request - { - BucketName = Settings.Current.Bucket, - Prefix = prefix - }; - - var config = new AmazonS3Config - { - Timeout = TimeSpan.FromMinutes(60), - RegionEndpoint = Amazon.RegionEndpoint.USEast1, - MaxErrorRetry = 20 - }; - - using var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, - Settings.Current.S3AwsSecretAccessKey, config); - using var listObjects = client.ListObjectsV2Async(request); - listObjects.Wait(); - - foreach (var entry in listObjects.Result.S3Objects) - { - if (!entry.Key.EndsWith(".xml")) - continue; - - yield return entry.Key; - } - } - - public static string S3ReadAllText(string key) - { - key = key.Replace("\\", "/"); - var config = new AmazonS3Config - { - Timeout = TimeSpan.FromMinutes(60), - RegionEndpoint = Amazon.RegionEndpoint.USEast1, - MaxErrorRetry = 20 
- }; - - using var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, - Settings.Current.S3AwsSecretAccessKey, config); - var getObjectRequest = new GetObjectRequest - { - BucketName = Settings.Current.Bucket, - Key = key - }; - - try - { - var getObject = client.GetObjectAsync(getObjectRequest); - getObject.Wait(); - using var response = getObject.Result; - using var responseStream = response.ResponseStream; - using var reader = new StreamReader(responseStream, Encoding.Default); - var content = reader.ReadToEnd(); - return content; - - } - catch (Exception ex) - { - if (((Amazon.Runtime.AmazonServiceException)ex.InnerException).StatusCode == - System.Net.HttpStatusCode.NotFound) - { - Console.Write(" - not exists"); - return null; - } - - throw; - } - } - } -} diff --git a/sources/RunValidation/Helper.cs b/sources/RunValidation/Helper.cs deleted file mode 100644 index 51670047..00000000 --- a/sources/RunValidation/Helper.cs +++ /dev/null @@ -1,326 +0,0 @@ -using Amazon.S3; -using Amazon.S3.Model; -using Amazon.S3.Transfer; -using CsvHelper.Configuration; -using CsvHelper; -using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Diagnostics; -using System.Globalization; -using System.IO.Compression; -using System.IO; -using System.Linq; -using System.Text; -using System.Threading; -using System.Threading.Tasks; -using org.ohdsi.cdm.framework.common.Enums; -using org.ohdsi.cdm.presentation.lambdabuilder; -using ZstdSharp; -using Amazon.Runtime.Internal.Util; - -namespace RunValidation -{ - - internal class Helper - { - internal static void CheckChunk(string localTmpPath, List objects, string awsAccessKeyId, string awsSecretAccessKey, string bucket, - KeyValuePair>> chunk) - { - - var missed = 0; - var dups = 0; - - var attempt = 0; - var complete = false; - - var config = new AmazonS3Config - { - Timeout = TimeSpan.FromMinutes(60), - RegionEndpoint = Amazon.RegionEndpoint.USEast1, - BufferSize = 1512 * 1024, 
- MaxErrorRetry = 120 - }; - - while (!complete) - { - try - { - attempt++; - - var timer = new Stopwatch(); - timer.Start(); - - var cnt = 0; - var attempt1 = attempt; - - Parallel.ForEach(objects, new ParallelOptions() { MaxDegreeOfParallelism = 10 }, o => - { - var loadAttempt = 0; - var loaded = false; - while (!loaded) - { - try - { - loadAttempt++; - using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, config)) - using (var transferUtility = new TransferUtility(client)) - { - transferUtility.Download($@"{localTmpPath}\{o.Key}", bucket, o.Key); - } - loaded = true; - } - catch (Exception) - { - if (loadAttempt <= 11) - { - Console.WriteLine(o.Key + " | " + loadAttempt); - } - else - { - throw; - } - } - } - - - using (var responseStream = File.Open($@"{localTmpPath}\{o.Key}", FileMode.Open)) - using (var bufferedStream = new BufferedStream(responseStream)) - using (var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress)) - using (var reader = new StreamReader(gzipStream, Encoding.Default)) - { - using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) - { - HasHeaderRecord = false, - Delimiter = ",", - Encoding = Encoding.UTF8 - }); - while (csv.Read()) - { - var personId = (long)csv.GetField(typeof(long), 0); - - chunk.Value[personId].Add(o.Key); - } - - Interlocked.Increment(ref cnt); - - Console.Write( - $"\rchunkId={chunk.Key} | {cnt} from {objects.Count} | attempt={attempt1}"); - } - - File.Delete($@"{localTmpPath}\{o.Key}"); - - }); - - - foreach (var ci in chunk.Value) - { - if (ci.Value.Count == 0) - { - missed++; - } - else if (ci.Value.Count > 1) - { - dups++; - } - } - - timer.Stop(); - Console.WriteLine($" | DONE | missed={missed}; dups={dups} | total={timer.ElapsedMilliseconds}ms"); - - complete = true; - } - catch (Exception) - { - Console.Write(" | Exception"); - if (attempt > 3) - { - throw; - } - } - } - } - - internal static IEnumerable> GetObjectsFromS3(Vendor vendor, 
int buildingId, string awsAccessKeyId, string awsSecretAccessKey, - string bucket, string cdmFolder, string table, int chunkId, IEnumerable slices, bool skipObjectCountChecking = false) - { - var orderedSlices = slices.OrderBy(s => s).ToList(); - bool stop = false; - for (int i = 0; i < orderedSlices.Count; i++) - { - if (stop) - break; - - var prefix = $"{vendor}/{buildingId}/{cdmFolder}/{table}/{table}.{orderedSlices[i]}.{chunkId}."; - - using var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, - Amazon.RegionEndpoint.USEast1); - var request = new ListObjectsV2Request - { - BucketName = bucket, - Prefix = prefix - }; - ListObjectsV2Response response; - do - { - var responseTask = client.ListObjectsV2Async(request); - responseTask.Wait(); - response = responseTask.Result; - - yield return response.S3Objects; - - request.ContinuationToken = response.NextContinuationToken; - - //assume that all objects are not sparse, so no other object is expected with number more that the previous one - if (!skipObjectCountChecking && response.S3Objects.Count == 0) - { - stop = true; - break; - } - } while (response.IsTruncated); - } - - } - - internal static List FindSlicesByPersonIds(string awsAccessKeyId, string awsSecretAccessKey, string bucket, Vendor vendor, int buildingId, int chunkId, string table, Dictionary personIds, int personIndex) - { - var prefix = $"{vendor}/{buildingId}/raw/{chunkId}/{table}/{table}"; - - var input = new ConcurrentDictionary(); - - foreach (var pId in personIds.Keys) - { - input.TryAdd(pId, false); - } - - var result = new ConcurrentDictionary(); - using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = bucket, - Prefix = prefix - }; - - var r = client.ListObjectsV2Async(request); - r.Wait(); - var response = r.Result; - var rows = new List(); - - Parallel.ForEach(response.S3Objects, o => - { - using var 
transferUtility = new TransferUtility(awsAccessKeyId, awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); - using var responseStream = transferUtility.OpenStream(bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); - using Stream stream = o.Key.EndsWith(".gz") - ? new GZipStream(bufferedStream, CompressionMode.Decompress) - : new DecompressionStream(bufferedStream) //.zst - ; - using var reader = new StreamReader(stream, Encoding.Default); - string? line = reader.ReadLine(); - while (!string.IsNullOrEmpty(line)) - { - if (input.IsEmpty) - break; - - long personId = long.Parse(line.Split('\t')[personIndex]); - if (personIds.ContainsKey(personId)) - { - result.TryAdd(o.Key, false); - input.TryRemove(personId, out var res); - break; - } - line = reader.ReadLine(); - } - }); - } - - return [.. result.Keys]; - } - - internal static IEnumerable>>> GetChunksFromS3(string localTmpPath, Vendor vendor, int buildingId, - string awsAccessKeyId, string awsSecretAccessKey, - string bucket) - { - var currentChunkId = 0; - var result = new KeyValuePair>>(0, []); - var prefix = $"{vendor}/{buildingId}/_chunks"; - using (var client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = bucket, - Prefix = prefix - }; - - var response = client.ListObjectsV2Async(request); - response.Wait(); - - foreach (var o in response.Result.S3Objects.OrderBy(o => o.LastModified)) - { - var loadAttempt = 0; - var loaded = false; - while (!loaded) - { - try - { - loadAttempt++; - using (var transferUtility = new TransferUtility(client)) - { - transferUtility.Download($@"{localTmpPath}\{o.Key}", bucket, o.Key); - } - loaded = true; - } - catch (Exception) - { - if (loadAttempt <= 11) - { - Console.WriteLine(o.Key + " | " + loadAttempt); - } - else - { - throw; - } - } - } - - using (var responseStream = File.Open($@"{localTmpPath}\{o.Key}", FileMode.Open)) - using (var 
bufferedStream = new BufferedStream(responseStream)) - using (var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress)) - using (var reader = new StreamReader(gzipStream, Encoding.Default)) - { - string line; - while ((line = reader.ReadLine()) != null) - { - if (!string.IsNullOrEmpty(line)) - { - var chunkId = int.Parse(line.Split('\t')[0]); - - if (currentChunkId != chunkId) - { - if (result.Value.Count > 0) - yield return result; - - result = new KeyValuePair>>(chunkId, - []); - currentChunkId = chunkId; - } - - var personId = long.Parse(line.Split('\t')[1]); - result.Value.Add(personId, []); - } - } - } - - File.Delete($@"{localTmpPath}\{o.Key}"); - } - } - - if (result.Value.Count > 0) - yield return result; - } - } - -} \ No newline at end of file diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 53a04635..b13b0b0a 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -5,6 +5,7 @@ using org.ohdsi.cdm.framework.common.Utility; using System.Configuration; using System.ComponentModel.Design; +using System.Linq; namespace RunValidation { @@ -25,10 +26,7 @@ internal class Options public string LocalTmpPath { get; set; } = "C:\\_tmp"; [Option('c', "chunks", Separator = ',', HelpText = "(Optional) Comma-separated list of chunk IDs to process. All of them, if omitted.")] - public IEnumerable Chunks { get; set; } = new List(); - - [Option('s', "slices", Separator = ',', HelpText = "(Optional) Comma-separated list of slice IDs to process for each chunkId. 
100, if omitted.")] - public IEnumerable Slices { get; set; } = new List(); + public IEnumerable ChunkSlicePairs { get; set; } = new List(); [Usage(ApplicationAlias = "RunValidation")] public static IEnumerable Examples @@ -40,9 +38,9 @@ public static IEnumerable Examples yield return new Example("Process all chunks for an external .dll", new Options { Vendor = "ExternalVendorName", BuildingId = 123, EtlLibraryPath = "C:\\PathToExternalDllFolder"}); yield return new Example("Process specified chunks", new Options - { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1, 2, 3 } }); - yield return new Example("Process specified slices for specified chunks", new Options - { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1, 2, 3 }, Slices = new List { 1, 2, 3 } }); + { Vendor = "VendorName", BuildingId = 123, ChunkSlicePairs = new List { "1", "2", "3" } }); + yield return new Example("Process specified pairs chunk:slice. If : omitted, then process all slices for a given chunk", new Options + { Vendor = "VendorName", BuildingId = 123, ChunkSlicePairs = new List { "1", "2:1", "2:2", "3" } }); } } } @@ -54,7 +52,6 @@ public static IEnumerable Examples static void Main(string[] args) { - //for some reason it returns more errors with partial scanning than with full scanning Parser.Default.ParseArguments(args) .WithParsed(RunWithOptions) .WithNotParsed(HandleParseError); @@ -64,40 +61,45 @@ static void Main(string[] args) static void RunWithOptions(Options opts) { - var chunks = opts.Chunks ?? Enumerable.Empty(); - var slices = opts.Slices ?? Enumerable.Empty(); + var chunkSlicePairs = new List<(int Chunk, int? Slice)>(); + foreach (var raw in opts.ChunkSlicePairs) + { + var parts = raw.Replace(" ", "").Split(':'); + + int chunkId = int.Parse(parts[0]); + + int? 
sliceId = null; + if (parts.Length > 1 && int.TryParse(parts[1], out int sliceIdTmp)) + sliceId = sliceIdTmp; + + chunkSlicePairs.Add((chunkId, sliceId)); + } + var chunkSlicePairsStrings = chunkSlicePairs.Select(s => s.Chunk + (s.Slice.HasValue ? ":" + s.Slice : "")).ToList(); Console.WriteLine("Options:"); - Console.WriteLine($"Keys: {_awsAccessKeyId} - {_awsSecretAccessKey}"); + //Console.WriteLine($"Keys: {_awsAccessKeyId} - {_awsSecretAccessKey}"); Console.WriteLine($"Bucket - folder: {_bucket} - {_cdmFolder}"); Console.WriteLine($"Vendor: {opts.Vendor}"); Console.WriteLine($"Building ID: {opts.BuildingId}"); Console.WriteLine($"EtlLibraryPath: {opts.EtlLibraryPath}"); Console.WriteLine($"LocalTmpPath: {opts.LocalTmpPath}"); - Console.WriteLine($"Chunks: {string.Join(", ", chunks)}"); - Console.WriteLine($"Slices: {string.Join(", ", slices)}"); + Console.WriteLine($"ChunkSlicePairs: {string.Join(", ", chunkSlicePairsStrings)}"); Console.WriteLine($"Current directory: {Directory.GetCurrentDirectory()}"); Console.WriteLine(); - //int[] slicesNum = [24, 40, 48, 96, 192]; - - //var localTmpPath = "C:\\_tmp"; - //var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, localTmpPath); - //validation.Start((Vendor)Enum.Parse(typeof(Vendor), args[0]), int.Parse(args[1]), slicesNum[0], _cdmFolder); Vendor vendor = EtlLibrary.CreateVendorInstance(opts.EtlLibraryPath, opts.Vendor); var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, opts.LocalTmpPath, _cdmFolder); - validation.ValidateBuildingId(vendor, opts.BuildingId, chunks, slices); + validation.ValidateBuildingId(vendor, opts.BuildingId, chunkSlicePairs); } static void HandleParseError(IEnumerable errs) { - // Handle errors Console.WriteLine("Failed to parse command-line arguments."); foreach (var error in errs) { Console.WriteLine(error.ToString()); - } + } } } } diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 
e1cf0c7d..956faaa8 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -15,6 +15,9 @@ using System.Runtime.InteropServices; using System.Text; using ZstdSharp; +using org.ohdsi.cdm.framework.desktop.DbLayer; +using Microsoft.VisualBasic; +using CsvHelper.Configuration.Attributes; namespace RunValidation { @@ -31,101 +34,62 @@ public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string private readonly LambdaUtility _lambdaUtility = new LambdaUtility(awsAccessKeyId, awsSecretAccessKey, awsAccessKeyId, awsSecretAccessKey, bucket, bucket, bucket, cdmFolder); + #endregion #region Methods - public void ValidateBuildingId(Vendor vendor, int buildingIdToCheck, IEnumerable chunkIdsToCheck, IEnumerable slicesToCheck) + public void ValidateBuildingId(Vendor vendor, int buildingId, List<(int ChunkId, int? SliceId)> chunkSlicePairs) { - var wrong = new List(); + var _wrong = new List(); var timer = new Stopwatch(); timer.Start(); - foreach (var chunk in GetChunks(vendor, buildingIdToCheck)) + foreach (var awsChunk in GetChunks(vendor, buildingId)) { - var chunkId = chunk.Key; - if (chunkIdsToCheck != null && chunkIdsToCheck.Any() && !chunkIdsToCheck.Any(s => s == chunkId)) - { - Console.WriteLine("Skip chunkId " + chunkId); - continue; - } + var awsChunkId = awsChunk.Key; - var actualSlices = GetActualSlices(vendor.Name, buildingIdToCheck); - - var slices2process = (slicesToCheck == null || !slicesToCheck.Any()) - ? 
actualSlices - .OrderBy(s => s) - .ToList() - : slicesToCheck - .Distinct() - .Where(s => actualSlices.Any(a => a == s)) - .OrderBy(s => s) - .ToList() - ; - - var objects = new List(); - foreach (var o in GetObjects(vendor, buildingIdToCheck, "PERSON", chunkId, slices2process)) + if (chunkSlicePairs.Any() && !chunkSlicePairs.Any(s => s.ChunkId == awsChunkId)) { - objects.AddRange(o); - } - - foreach (var o in GetObjects(vendor, buildingIdToCheck, "METADATA_TMP", chunkId, slices2process)) - { - objects.AddRange(o); - } - - if (objects.Count == 0) - { - wrong.Add($"chunkId={chunkId} - MISSED"); + Console.WriteLine("Skip chunkId " + awsChunkId); + continue; } - ProcessChunk(vendor, buildingIdToCheck, objects, chunk, slices2process, true); - - foreach (var c in chunk.Value) - { - if (c.Value.Count != 1) - wrong.Add($"chunkId={chunkId};person_id={c.Key};files={string.Join(',', [.. c.Value])}"); - } + var slices = chunkSlicePairs + .Where(s => s.ChunkId == awsChunkId) + .Where(s => s.SliceId != null) + .Select(s => s.SliceId ?? -1) //change type int? to int + .ToList(); + + ValidateChunkId(vendor, buildingId, awsChunkId, slices); } Console.WriteLine(); timer.Stop(); - Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); + Console.WriteLine($"Done. Total seconds={timer.ElapsedMilliseconds/1000}s"); timer.Restart(); } - private HashSet GetActualSlices(string vendorName, int buildingId) + private void ValidateChunkId(Vendor vendor, int buildingId, int chunkId, IEnumerable slices) { - var slices = new HashSet(); - var prefix = $"{vendorName}/{buildingId}/{_cdmFolder}/PERSON/PERSON."; - Console.WriteLine("Calculating slices " + _bucket + "|" + prefix); - using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + var actualSlices = GetActualSlices(vendor.Name, buildingId); + var slices2process = (!slices.Any()) + ? 
actualSlices + .OrderBy(s => s) + .ToList() + : slices + .Distinct() + .Where(s => actualSlices.Any(a => a == s)) + .OrderBy(s => s) + .ToList() + ; + + var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices2process); + + Parallel.ForEach(s3ObjectsBySlice, slice => { - var request = new ListObjectsV2Request - { - BucketName = _bucket, - Prefix = prefix - }; - ListObjectsV2Response response; - - do - { - var responseTask = client.ListObjectsV2Async(request); - responseTask.Wait(); - response = responseTask.Result; - - foreach (var o in response.S3Objects) - { - slices.Add(int.Parse(o.Key.Split('.')[1])); - } - - request.ContinuationToken = response.NextContinuationToken; - } while (response.IsTruncated); - } - - Console.WriteLine("slices.Count=" + slices.Count); - - return slices; + ValidateSliceId(vendor, buildingId, chunkId, slice.Key, slice.Value.personObjects, slice.Value.metadataObjects); + }); } private IEnumerable>>> GetChunks(Vendor vendor, int buildingId) @@ -178,7 +142,78 @@ private IEnumerable>>> GetChunks yield return result; } - private IEnumerable> GetObjects(Vendor vendor, int buildingId, string table, int chunkId, List slices) + private HashSet GetActualSlices(string vendorName, int buildingId) + { + var slices = new HashSet(); + var prefix = $"{vendorName}/{buildingId}/{_cdmFolder}/PERSON/PERSON."; + Console.WriteLine("Calculating slices " + _bucket + "|" + prefix); + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = prefix + }; + ListObjectsV2Response response; + + do + { + var responseTask = client.ListObjectsV2Async(request); + responseTask.Wait(); + response = responseTask.Result; + + foreach (var o in response.S3Objects) + { + slices.Add(int.Parse(o.Key.Split('.')[1])); + } + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated); + } + + 
Console.WriteLine("slices.Count=" + slices.Count); + Console.WriteLine(); + + return slices; + } + + private Dictionary personObjects, List metadataObjects)> GetS3ObjectsBySlice(Vendor vendor, + int buildingId, int chunkId, List slices2process) + { + var s3ObjectsBySlice = new Dictionary PersonObjects, List MetadataObjects)>(); + + foreach (var tuple in GetObjects(vendor, buildingId, "PERSON", chunkId, slices2process)) + { + int sliceId = tuple.Item1; + List personObjects = tuple.Item2; + + if (!s3ObjectsBySlice.ContainsKey(sliceId)) + s3ObjectsBySlice[sliceId] = (new List(), new List()); + + s3ObjectsBySlice[sliceId].PersonObjects.AddRange(personObjects); + } + + foreach (var tuple in GetObjects(vendor, buildingId, "METADATA_TMP", chunkId, slices2process)) + { + int sliceId = tuple.Item1; + List metadataObjects = tuple.Item2; + + if (!s3ObjectsBySlice.ContainsKey(sliceId)) + s3ObjectsBySlice[sliceId] = (new List(), new List()); + + s3ObjectsBySlice[sliceId].MetadataObjects.AddRange(metadataObjects); + } + + if (s3ObjectsBySlice.Count == 0) + { + var msg = $"chunkId={chunkId} - MISSED"; + Console.WriteLine(msg); + } + + return s3ObjectsBySlice; + } + + private IEnumerable>> GetObjects(Vendor vendor, int buildingId, string table, int chunkId, List slices) { var orderedSlices = slices.Distinct().OrderBy(s => s).ToList(); for (int i = 0; i < orderedSlices.Count; i++) @@ -194,15 +229,15 @@ private IEnumerable> GetObjects(Vendor vendor, int buildingId, st do { response = client.ListObjectsV2Async(request).GetAwaiter().GetResult(); - yield return response.S3Objects; + yield return Tuple.Create(orderedSlices[i], response.S3Objects); request.ContinuationToken = response.NextContinuationToken; } while (response.IsTruncated); } } - private Dictionary ProcessChunk(Vendor vendor, int buildingId, List objects, KeyValuePair>> chunk, List slices, bool onlyCheck = true) + private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId, int sliceId, + List personObjects, 
List metadataObjects) { var attempt = 0; var complete = false; @@ -212,21 +247,19 @@ private Dictionary ProcessChunk(Vendor vendor, int buildingId, List< try { attempt++; - var missed = 0; - var dups = 0; - - foreach (var ci in chunk.Value) - { - ci.Value.Clear(); - } + var appearenceStatsByPersonId = new Dictionary(); var timer = new Stopwatch(); timer.Start(); + #region get personAppearenceStats + var cnt = 0; var attempt1 = attempt; - Parallel.ForEach(objects, o => + var allObjects = personObjects.Union(metadataObjects).ToList(); + + Parallel.ForEach(allObjects, o => { using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); using var responseStream = transferUtility.OpenStream(_bucket, o.Key); @@ -245,56 +278,54 @@ private Dictionary ProcessChunk(Vendor vendor, int buildingId, List< while (csv.Read()) { var personId = (long)csv.GetField(typeof(long), 0); + lock (appearenceStatsByPersonId) + { + if (!appearenceStatsByPersonId.ContainsKey(personId)) + appearenceStatsByPersonId[personId] = (0, 0); + + var tuple = appearenceStatsByPersonId[personId]; - chunk.Value[personId].Add(o.Key); + if (o.Key.Contains("PERSON")) + tuple.InPersonCount++; + else if (o.Key.Contains("METADATA_TMP")) + tuple.InMetadataCount++; + + appearenceStatsByPersonId[personId] = tuple; + } } Interlocked.Increment(ref cnt); }); - int wrngCnt = 0; - var wrongPersonIds = new Dictionary(); - foreach (var ci in chunk.Value) + #endregion + + int wrongCount = 0; + var dups = 0; + var wrongPersonIds = new HashSet(); + + foreach (var kvp in appearenceStatsByPersonId) { - if (ci.Value.Count == 0) - { - missed++; - } - else if (ci.Value.Count > 1) - { - dups++; - } + var personId = kvp.Key; + var stats = kvp.Value; - if (ci.Value.Count != 1) + //check InPersonCount just in case, InMetadataCount should actually suffice + if (stats.InPersonCount > 1 || stats.InMetadataCount > 0) { - wrngCnt++; + wrongCount++; + + if (stats.InPersonCount > 1 || 
stats.InMetadataCount > 1) + dups++; - if (wrngCnt == 1 || wrngCnt % 500 == 0) - wrongPersonIds.Add(ci.Key, false); + if(!wrongPersonIds.Contains(personId)) + wrongPersonIds.Add(personId); } } timer.Stop(); - if (missed > 0 || dups > 0) + if (wrongCount > 0) { - Console.WriteLine($"XXX ChunkId={chunk.Key} | missed={missed}; dups={dups} | {wrongPersonIds.Keys.Count}"); - if (!onlyCheck) - { - Cleanup(vendor, buildingId, chunk.Key, slices); - var tasks = _lambdaUtility.TriggerBuildFunction(vendor, buildingId, chunk.Key, false); - Task.WaitAll([.. tasks]); - - var checkCreation = Task.Run(() => _lambdaUtility.AllChunksWereDone(vendor, buildingId, _lambdaUtility.BuildMessageBucket)); - - checkCreation.Wait(); - - foreach (var personId in chunk.Value.Keys) - { - chunk.Value[personId].Clear(); - } - - throw new Exception("restart"); - } + var msg = $"BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} | WrongCount={wrongCount}; Duplicates={dups} | Wrong Person Id Example={wrongPersonIds.First()}"; + Console.WriteLine(msg); } complete = true; @@ -312,109 +343,6 @@ private Dictionary ProcessChunk(Vendor vendor, int buildingId, List< return null; } - private void Cleanup(Vendor vendor, int buildingId, int chunkId, List slices) - { - var tables = new[] - { - "PERSON", - "OBSERVATION_PERIOD", - "PAYER_PLAN_PERIOD", - "DEATH", - "DRUG_EXPOSURE", - "OBSERVATION", - "VISIT_OCCURRENCE", - "VISIT_DETAIL", - "PROCEDURE_OCCURRENCE", - "DRUG_ERA", - "CONDITION_ERA", - "DEVICE_EXPOSURE", - "MEASUREMENT", - "COHORT", - "CONDITION_OCCURRENCE", - "COST", - "NOTE", - "METADATA_TMP", - "FACT_RELATIONSHIP" - }; - - Console.WriteLine("Cleaning chunkId=" + chunkId); - - foreach (var table in tables) - { - Console.WriteLine("Cleaning table=" + table); - - for (var i = 0; i < slices.Count; i++) - { - Clean(vendor, buildingId, chunkId, table, slices[i]); - } - } - - Console.WriteLine($"chunkId={chunkId} was cleaned"); - } - - private void Clean(Vendor vendor, int buildingId, int chunkId, 
string table, int slice) - { - var attempt = 0; - var complete = false; - - while (!complete) - { - try - { - attempt++; - - var perfix = $"{vendor}/{buildingId}/{_cdmFolder}/{table}/{table}.{slice}.{chunkId}."; - - using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = _bucket, - Prefix = perfix - }; - ListObjectsV2Response response; - do - { - using var getListObjects = client.ListObjectsV2Async(request); - getListObjects.Wait(); - response = getListObjects.Result; - - var multiObjectDeleteRequest = new DeleteObjectsRequest - { - BucketName = _bucket - }; - - foreach (var o in response.S3Objects) - { - multiObjectDeleteRequest.AddKey(o.Key, null); - } - - if (response.S3Objects.Count > 0) - { - using var deleteObjects = client.DeleteObjectsAsync(multiObjectDeleteRequest); - deleteObjects.Wait(); - - //Console.WriteLine(response.S3Objects.Count + " files deleted"); - } - - request.ContinuationToken = response.NextContinuationToken; - } while (response.IsTruncated == true); - } - - complete = true; - } - catch (Exception ex) - { - Console.Write(" | [Clean] Exception | new attempt | " + attempt); - Console.WriteLine(ex.Message); - if (attempt > 3) - { - throw; - } - } - } - } - #endregion } diff --git a/sources/RunValidation/Validation2.cs b/sources/RunValidation/Validation2.cs deleted file mode 100644 index c9052456..00000000 --- a/sources/RunValidation/Validation2.cs +++ /dev/null @@ -1,449 +0,0 @@ -using Amazon.S3; -using Amazon.S3.Model; -using Amazon.S3.Transfer; -using CsvHelper; -using CsvHelper.Configuration; -using org.ohdsi.cdm.framework.common.Helpers; -using org.ohdsi.cdm.framework.desktop.Settings; -using System.Diagnostics; -using System.Globalization; -using System.IO.Compression; -using System.Text; - -namespace RunValidation -{ - [Obsolete] //it's here for a reference; //todo delete //copy from Presentation - public class Validation2 
- { - private string _cdmFolder = "cdmCSV"; - private LambdaUtility _lambdaUtility; - - public void Start(LambdaUtility utility, string cdmCsvFolder) - { - var wrong = new List(); - var timer = new Stopwatch(); - timer.Start(); - _cdmFolder = cdmCsvFolder; - - var slicesNum = GetSlicesNum(); - _lambdaUtility = utility; - - foreach (var chunk in GetChunk()) - { - var chunkId = chunk.Key; - var objects = new List(); - foreach (var o in GetObjects("PERSON", chunkId, slicesNum)) - { - objects.AddRange(o); - } - - foreach (var o in GetObjects("METADATA_TMP", chunkId, slicesNum)) - { - objects.AddRange(o); - } - - if (objects.Count == 0) - { - wrong.Add($"chunkId={chunkId} - MISSED"); - } - - ProcessChunk(objects, chunk, slicesNum, false); - - foreach (var c in chunk.Value) - { - if (c.Value.Count != 1) - wrong.Add($"chunkId={chunkId};person_id={c.Key};files={string.Join(',', [.. c.Value])}"); - } - } - - Console.WriteLine(); - timer.Stop(); - Console.WriteLine($"Total={timer.ElapsedMilliseconds}ms"); - timer.Restart(); - } - - private int GetSlicesNum() - { - var slices = new HashSet(); - var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/{_cdmFolder}/PERSON/PERSON."; - Console.WriteLine("Calculating slices num " + Settings.Current.Bucket + "|" + prefix); - using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - - { - var request = new ListObjectsV2Request - { - BucketName = Settings.Current.Bucket, - Prefix = prefix - }; - ListObjectsV2Response response; - - do - { - var responseTask = client.ListObjectsV2Async(request); - responseTask.Wait(); - response = responseTask.Result; - - foreach (var o in response.S3Objects) - { - slices.Add(o.Key.Split('.')[1]); - } - - request.ContinuationToken = response.NextContinuationToken; - } while (response.IsTruncated); - } - - Console.WriteLine("slices.Count=" + slices.Count); - - return slices.Count; - } 
- - private static IEnumerable>>> GetChunk() - { - var currentChunkId = 0; - var result = new KeyValuePair>>(0, []); - var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/_chunks"; - using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = Settings.Current.Bucket, - Prefix = prefix - }; - - var response = client.ListObjectsV2Async(request); - response.Wait(); - - foreach (var o in response.Result.S3Objects.OrderBy(o => o.LastModified)) - { - using var transferUtility = new TransferUtility(Settings.Current.S3AwsAccessKeyId, - Settings.Current.S3AwsSecretAccessKey, Amazon.RegionEndpoint.USEast1); - using var responseStream = transferUtility.OpenStream(Settings.Current.Bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); - using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); - using var reader = new StreamReader(gzipStream, Encoding.Default); - string line; - while ((line = reader.ReadLine()) != null) - { - if (!string.IsNullOrEmpty(line)) - { - var chunkId = int.Parse(line.Split('\t')[0]); - - if (currentChunkId != chunkId) - { - if (result.Value.Count > 0) - yield return result; - - result = new KeyValuePair>>(chunkId, - []); - currentChunkId = chunkId; - } - - var personId = long.Parse(line.Split('\t')[1]); - result.Value.Add(personId, []); - } - } - } - } - - if (result.Value.Count > 0) - yield return result; - } - - public IEnumerable> GetObjects(string table, int chunkId, int slicesNum) - { - for (int i = 0; i < slicesNum; i++) - { - var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/{_cdmFolder}/{table}/{table}.{i}.{chunkId}."; - using var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, - Amazon.RegionEndpoint.USEast1); - var 
request = new ListObjectsV2Request - { - BucketName = Settings.Current.Bucket, - Prefix = prefix - }; - ListObjectsV2Response response; - do - { - var responseTask = client.ListObjectsV2Async(request); - responseTask.Wait(); - response = responseTask.Result; - - yield return response.S3Objects; - - request.ContinuationToken = response.NextContinuationToken; - } while (response.IsTruncated); - } - - } - - public Dictionary ProcessChunk(List objects, KeyValuePair>> chunk, int slicesNum, bool onlyCheck) - { - var attempt = 0; - var complete = false; - - while (!complete) - { - try - { - attempt++; - var missed = 0; - var dups = 0; - - foreach (var ci in chunk.Value) - { - ci.Value.Clear(); - } - - var timer = new Stopwatch(); - timer.Start(); - - var cnt = 0; - var attempt1 = attempt; - - Parallel.ForEach(objects, o => - { - using var transferUtility = new TransferUtility(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, - Amazon.RegionEndpoint.USEast1); - using var responseStream = transferUtility.OpenStream(Settings.Current.Bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); - using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); - using var reader = new StreamReader(gzipStream, Encoding.Default); - using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) - { - HasHeaderRecord = false, - Delimiter = ",", - Encoding = Encoding.UTF8 - }); - while (csv.Read()) - { - var personId = (long)csv.GetField(typeof(long), 0); - - chunk.Value[personId].Add(o.Key); - } - Interlocked.Increment(ref cnt); - }); - - int wrngCnt = 0; - var wrongPersonIds = new Dictionary(); - foreach (var ci in chunk.Value) - { - if (ci.Value.Count == 0) - { - missed++; - } - else if (ci.Value.Count > 1) - { - dups++; - } - - if (ci.Value.Count != 1) - { - wrngCnt++; - - if (wrngCnt == 1 || wrngCnt % 500 == 0) - wrongPersonIds.Add(ci.Key, false); - } - } - - timer.Stop(); - - if (missed > 
0 || dups > 0) - { - Console.WriteLine($"XXX ChunkId={chunk.Key} | missed={missed}; dups={dups} | {wrongPersonIds.Keys.Count}"); - if (!onlyCheck) - { - Cleanup(chunk.Key, slicesNum); - var tasks = _lambdaUtility.TriggerBuildFunction(Settings.Current.Building.Vendor, Settings.Current.Building.Id.Value, chunk.Key, false); - Task.WaitAll([.. tasks]); - - var checkCreation = Task.Run(() => _lambdaUtility.AllChunksWereDone(Settings.Current.Building.Vendor, - Settings.Current.Building.Id.Value, _lambdaUtility.BuildMessageBucket)); - - checkCreation.Wait(); - - foreach (var personId in chunk.Value.Keys) - { - chunk.Value[personId].Clear(); - } - - throw new Exception("restart"); - } - } - - complete = true; - return wrongPersonIds; - } - catch (Exception ex) - { - Console.Write(ex.Message + " | [ProcessChunk] Exception | new attempt | attempt=" + attempt); - if (attempt > 3) - { - throw; - } - } - } - return null; - } - - private void Cleanup(int chunkId, int slicesNum) - { - var tables = new[] - { - "PERSON", - "OBSERVATION_PERIOD", - "PAYER_PLAN_PERIOD", - "DEATH", - "DRUG_EXPOSURE", - "OBSERVATION", - "VISIT_OCCURRENCE", - "VISIT_DETAIL", - "PROCEDURE_OCCURRENCE", - "DRUG_ERA", - "CONDITION_ERA", - "DEVICE_EXPOSURE", - "MEASUREMENT", - "COHORT", - "CONDITION_OCCURRENCE", - "COST", - "NOTE", - "METADATA_TMP", - "FACT_RELATIONSHIP" - }; - - Console.WriteLine("Cleaning chunkId=" + chunkId); - - foreach (var table in tables) - { - Console.WriteLine("Cleaning table=" + table); - - for (var i = 0; i < slicesNum; i++) - { - Clean(chunkId, table, i); - } - } - - Console.WriteLine($"chunkId={chunkId} was cleaned"); - } - - private static IEnumerable GetLines(Stream stream) - { - using var bufferedStream = new BufferedStream(stream); - using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); - using var reader = new StreamReader(gzipStream, Encoding.Default); - string line; - while ((line = reader.ReadLine()) != null) - { - if 
(!string.IsNullOrEmpty(line)) - { - yield return line; - } - } - } - - public static List FindSlice(int chunkId, string table, Dictionary personIds, int personIndex) - { - var prefix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/raw/{chunkId}/{table}/{table}"; - - var result = new HashSet(); - using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = Settings.Current.Bucket, - Prefix = prefix - }; - - var r = client.ListObjectsV2Async(request); - r.Wait(); - var response = r.Result; - var rows = new List(); - foreach (var o in response.S3Objects) - { - using var transferUtility = new TransferUtility(Settings.Current.S3AwsAccessKeyId, - Settings.Current.S3AwsSecretAccessKey, Amazon.RegionEndpoint.USEast1); - using var responseStream = transferUtility.OpenStream(Settings.Current.Bucket, o.Key); - { - foreach (var line in GetLines(responseStream)) - { - long personId = long.Parse(line.Split('\t')[personIndex]); - if (personIds.ContainsKey(personId)) - { - result.Add(o.Key); - break; - } - } - } - } - } - - return [.. 
result]; - } - - public void Clean(int chunkId, string table, int slice) - { - var attempt = 0; - var complete = false; - - while (!complete) - { - try - { - attempt++; - - var perfix = $"{Settings.Current.Building.Vendor}/{Settings.Current.Building.Id.Value}/{_cdmFolder}/{table}/{table}.{slice}.{chunkId}."; - - using (var client = new AmazonS3Client(Settings.Current.S3AwsAccessKeyId, Settings.Current.S3AwsSecretAccessKey, - Amazon.RegionEndpoint.USEast1)) - { - var request = new ListObjectsV2Request - { - BucketName = Settings.Current.Bucket, - Prefix = perfix - }; - ListObjectsV2Response response; - do - { - using var getListObjects = client.ListObjectsV2Async(request); - getListObjects.Wait(); - response = getListObjects.Result; - - var multiObjectDeleteRequest = new DeleteObjectsRequest - { - BucketName = Settings.Current.Bucket - }; - - foreach (var o in response.S3Objects) - { - multiObjectDeleteRequest.AddKey(o.Key, null); - } - - if (response.S3Objects.Count > 0) - { - using var deleteObjects = client.DeleteObjectsAsync(multiObjectDeleteRequest); - deleteObjects.Wait(); - - //Console.WriteLine(response.S3Objects.Count + " files deleted"); - } - - request.ContinuationToken = response.NextContinuationToken; - } while (response.IsTruncated == true); - } - - complete = true; - } - catch (Exception ex) - { - Console.Write(" | [Clean] Exception | new attempt | " + attempt); - Console.WriteLine(ex.Message); - if (attempt > 3) - { - throw; - } - } - } - } - } -} \ No newline at end of file From d08cd5987cbce5cdd60624d6ced5be78a3be2a5e Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 23 Oct 2024 19:13:06 +0200 Subject: [PATCH 09/37] Clean usings --- sources/RunValidation/Program.cs | 5 +---- sources/RunValidation/Validation.cs | 8 -------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index b13b0b0a..21fc2d0f 100644 --- a/sources/RunValidation/Program.cs +++ 
b/sources/RunValidation/Program.cs @@ -1,11 +1,8 @@ -using Amazon.S3.Model; -using CommandLine.Text; +using CommandLine.Text; using CommandLine; using org.ohdsi.cdm.framework.common.Enums; using org.ohdsi.cdm.framework.common.Utility; using System.Configuration; -using System.ComponentModel.Design; -using System.Linq; namespace RunValidation { diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 956faaa8..1bd209ad 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -5,19 +5,11 @@ using CsvHelper; using org.ohdsi.cdm.framework.common.Enums; using org.ohdsi.cdm.framework.common.Helpers; -using System; -using System.Collections.Generic; using System.Diagnostics; using System.Globalization; -using System.IO; using System.IO.Compression; -using System.Numerics; -using System.Runtime.InteropServices; using System.Text; using ZstdSharp; -using org.ohdsi.cdm.framework.desktop.DbLayer; -using Microsoft.VisualBasic; -using CsvHelper.Configuration.Attributes; namespace RunValidation { From cf26c23cfb48981ee16798fe9327d446d801cba2 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 12:15:47 +0200 Subject: [PATCH 10/37] GetActualSlices for each BuildingId, not ChunkId. 
Output validated chunks --- sources/RunValidation/Validation.cs | 45 +++++++++++++++++------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 1bd209ad..fea5a78d 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -37,23 +37,41 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List<(int ChunkId, var timer = new Stopwatch(); timer.Start(); + var actualSlices = GetActualSlices(vendor.Name, buildingId); + foreach (var awsChunk in GetChunks(vendor, buildingId)) { var awsChunkId = awsChunk.Key; if (chunkSlicePairs.Any() && !chunkSlicePairs.Any(s => s.ChunkId == awsChunkId)) { - Console.WriteLine("Skip chunkId " + awsChunkId); + Console.WriteLine(); + Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} skipped"); continue; } + Console.WriteLine(); + Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} validation start"); var slices = chunkSlicePairs .Where(s => s.ChunkId == awsChunkId) .Where(s => s.SliceId != null) .Select(s => s.SliceId ?? -1) //change type int? to int .ToList(); - - ValidateChunkId(vendor, buildingId, awsChunkId, slices); + + var slices2process = (!slices.Any()) + ? 
actualSlices + .OrderBy(s => s) + .ToList() + : slices + .Distinct() + .Where(s => actualSlices.Any(a => a == s)) + .OrderBy(s => s) + .ToList() + ; + + ValidateChunkId(vendor, buildingId, awsChunkId, slices2process); + + Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} is validated"); } Console.WriteLine(); @@ -62,21 +80,10 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List<(int ChunkId, timer.Restart(); } - private void ValidateChunkId(Vendor vendor, int buildingId, int chunkId, IEnumerable slices) + private void ValidateChunkId(Vendor vendor, int buildingId, int chunkId, List slices) { - var actualSlices = GetActualSlices(vendor.Name, buildingId); - var slices2process = (!slices.Any()) - ? actualSlices - .OrderBy(s => s) - .ToList() - : slices - .Distinct() - .Where(s => actualSlices.Any(a => a == s)) - .OrderBy(s => s) - .ToList() - ; - - var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices2process); + + var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); Parallel.ForEach(s3ObjectsBySlice, slice => { @@ -316,7 +323,7 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId if (wrongCount > 0) { - var msg = $"BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} | WrongCount={wrongCount}; Duplicates={dups} | Wrong Person Id Example={wrongPersonIds.First()}"; + var msg = $"--BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} | WrongCount={wrongCount}; Duplicates={dups} | Wrong Person Id Example={wrongPersonIds.First()}"; Console.WriteLine(msg); } @@ -325,7 +332,7 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId } catch (Exception ex) { - Console.Write(ex.Message + " | [ProcessChunk] Exception | new attempt | attempt=" + attempt); + Console.Write("--" + ex.Message + " | [ProcessChunk] Exception | new attempt | attempt=" + attempt); if (attempt > 3) { throw; From ddc7668509c774a0ebd02582a2e6ce8579c4cc62 Mon Sep 
17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 12:19:13 +0200 Subject: [PATCH 11/37] fix region name --- sources/RunValidation/Validation.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index fea5a78d..1a292864 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -251,7 +251,7 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId var timer = new Stopwatch(); timer.Start(); - #region get personAppearenceStats + #region get appearenceStatsByPersonId var cnt = 0; var attempt1 = attempt; From 3e7528cf7f148c4a188aa8406005858fd3c08efa Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 12:22:30 +0200 Subject: [PATCH 12/37] fix --- sources/RunValidation/Validation.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 1a292864..0b5f82bb 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -46,7 +46,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List<(int ChunkId, if (chunkSlicePairs.Any() && !chunkSlicePairs.Any(s => s.ChunkId == awsChunkId)) { Console.WriteLine(); - Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} skipped"); + Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} is skipped"); continue; } Console.WriteLine(); From 0905f8bf72192fc1120b3d716c0d52c4c97032a1 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 12:40:17 +0200 Subject: [PATCH 13/37] Fix etlLibrary --- .../Common/Utility/EtlLibrary.cs | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index be26d760..52a15dff 100644 --- 
a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -15,12 +15,13 @@ public class Resource(string name, string value) public static class EtlLibrary { - private static IEnumerable GetETLAssemblies(string path) + private static IEnumerable GetETLAssemblies(string path) { - foreach (var assemblyFile in Directory.GetFiles(path, "*.dll")) - { - yield return Assembly.LoadFile(assemblyFile); - } + if (!string.IsNullOrEmpty(path)) + foreach (var assemblyFile in Directory.GetFiles(path, "*.dll")) + { + yield return Assembly.LoadFile(assemblyFile); + } } private static IEnumerable> FindAssemblyAndResource(string etlLibraryPath, string name) @@ -112,26 +113,28 @@ public static void LoadVendorSettings(string etlLibraryPath, IVendorSettings set public static Vendor CreateVendorInstance(string etlLibraryPath, string name) { - foreach (var assembly in GetETLAssemblies(etlLibraryPath)) + var currentAssemblies = GetETLAssemblies(Directory.GetCurrentDirectory()); + var externalAssemblies = GetETLAssemblies(etlLibraryPath); + var allAssemblies = currentAssemblies.Union(externalAssemblies); + var vendorTypes = allAssemblies + .SelectMany(s => s.GetTypes() + .Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract)); + var vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); + + if (vendorType == null) { - var vendorTypes = assembly.GetTypes().Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract); - var vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); - - if (vendorType == null) - { - name = name.ToLower().Replace("v5", "").Replace("full", ""); + name = name.ToLower().Replace("v5", "").Replace("full", ""); - vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); - } + vendorType = 
vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); + } - if (vendorType != null) - { - Console.WriteLine("CreateVendorInstance | assembly: " + assembly.GetName().Name); - Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); - Console.WriteLine(); + if (vendorType != null) + { + Console.WriteLine("CreateVendorInstance | source path: " + etlLibraryPath); + Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); + Console.WriteLine(); - return Activator.CreateInstance(vendorType) as Vendor; - } + return Activator.CreateInstance(vendorType) as Vendor; } throw new KeyNotFoundException($"CreateVendorInstance | Vendor: {name}; LibraryPath: {etlLibraryPath} - not exists"); From bca2a9e1f54d4283c939dfe03f2445cbe51ac660 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 12:48:43 +0200 Subject: [PATCH 14/37] Reorder CreateVendorInstance arguments, make etlLibraryPath nullable --- .../org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs | 6 +++--- .../Desktop/Settings/BuildingSettings.cs | 2 +- .../Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs | 2 +- .../org.ohdsi.cdm.presentation.lambdabuilder/Function.cs | 2 +- .../org.ohdsi.cdm.presentation.lambdamerge/Function.cs | 2 +- sources/RunValidation/Program.cs | 2 +- sources/Tests/RunETL/Program.cs | 2 +- sources/Tests/RunLocal/Program.cs | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index 52a15dff..8f807c7c 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -15,7 +15,7 @@ public class Resource(string name, string value) public static class EtlLibrary { - private static IEnumerable GetETLAssemblies(string path) + private static IEnumerable 
GetETLAssemblies(string? path) { if (!string.IsNullOrEmpty(path)) foreach (var assemblyFile in Directory.GetFiles(path, "*.dll")) @@ -111,11 +111,11 @@ public static void LoadVendorSettings(string etlLibraryPath, IVendorSettings set } } - public static Vendor CreateVendorInstance(string etlLibraryPath, string name) + public static Vendor CreateVendorInstance(string name, string? etlLibraryPath = null) { var currentAssemblies = GetETLAssemblies(Directory.GetCurrentDirectory()); var externalAssemblies = GetETLAssemblies(etlLibraryPath); - var allAssemblies = currentAssemblies.Union(externalAssemblies); + var allAssemblies = currentAssemblies.Union(externalAssemblies); var vendorTypes = allAssemblies .SelectMany(s => s.GetTypes() .Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract)); diff --git a/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs b/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs index f0a6fe29..2a6c4125 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs @@ -214,7 +214,7 @@ private void SetFrom(IDataReader reader) Console.WriteLine("Vendor: " + reader.GetString("Vendor")); Console.WriteLine("EtlLibraryPath: " + EtlLibraryPath); - this.Vendor = EtlLibrary.CreateVendorInstance(EtlLibraryPath, reader.GetString("Vendor")); + this.Vendor = EtlLibrary.CreateVendorInstance(reader.GetString("Vendor"), EtlLibraryPath); SetVendorSettings(); SetVocabularyVersion(); diff --git a/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs b/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs index 8160bc75..18ab49a7 100644 --- a/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs +++ b/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs @@ -86,7 +86,7 @@ static int Main(string[] arguments) IConfigurationRoot configuration = 
builder.Build(); - vendor = EtlLibrary.CreateVendorInstance(configuration.GetSection("AppSettings")["etlLibraryPath"], vendorName); + vendor = EtlLibrary.CreateVendorInstance(vendorName, configuration.GetSection("AppSettings")["etlLibraryPath"]); var builderConnectionString = configuration.GetConnectionString("Builder"); diff --git a/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs b/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs index cefc8a8a..b66fe9f4 100644 --- a/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs +++ b/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs @@ -264,7 +264,7 @@ public async Task FunctionHandler(S3Event evnt, ILambdaContext context) //} var vendorName = s3Event.Object.Key.Split('.')[0].Split('/').Last(); - vendor = EtlLibrary.CreateVendorInstance(EtlLibraryPath, vendorName); + vendor = EtlLibrary.CreateVendorInstance(vendorName, EtlLibraryPath); buildingId = int.Parse(s3Event.Object.Key.Split('.')[1]); _chunkId = int.Parse(s3Event.Object.Key.Split('.')[2]); diff --git a/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs b/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs index 3389f667..58152957 100644 --- a/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs +++ b/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs @@ -100,7 +100,7 @@ public async Task FunctionHandler(S3Event evnt, ILambdaContext context) try { var vendorName = _s3Event.Object.Key.Split('.')[0].Split('/').Last(); - _settings.Vendor = EtlLibrary.CreateVendorInstance(EtlLibraryPath, vendorName); + _settings.Vendor = EtlLibrary.CreateVendorInstance(vendorName, EtlLibraryPath); _settings.BuildingId = int.Parse(_s3Event.Object.Key.Split('.')[1]); _table = _s3Event.Object.Key.Split('.')[2].Trim(); _subChunkId = int.Parse(_s3Event.Object.Key.Split('.')[3]); diff --git 
a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 21fc2d0f..4d716fbc 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -85,7 +85,7 @@ static void RunWithOptions(Options opts) Console.WriteLine($"Current directory: {Directory.GetCurrentDirectory()}"); Console.WriteLine(); - Vendor vendor = EtlLibrary.CreateVendorInstance(opts.EtlLibraryPath, opts.Vendor); + Vendor vendor = EtlLibrary.CreateVendorInstance(opts.Vendor, opts.EtlLibraryPath); var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, opts.LocalTmpPath, _cdmFolder); validation.ValidateBuildingId(vendor, opts.BuildingId, chunkSlicePairs); } diff --git a/sources/Tests/RunETL/Program.cs b/sources/Tests/RunETL/Program.cs index c26b8c10..c960b7eb 100644 --- a/sources/Tests/RunETL/Program.cs +++ b/sources/Tests/RunETL/Program.cs @@ -30,7 +30,7 @@ static void Main(string[] args) { chunkscnt = o.ChunksCnt.Value; slicescnt = o.SlicesCnt.Value; - vendor = EtlLibrary.CreateVendorInstance(ConfigurationManager.AppSettings["etlLibraryPath"], o.Vendor); + vendor = EtlLibrary.CreateVendorInstance(o.Vendor, ConfigurationManager.AppSettings["etlLibraryPath"]); buildingid = o.Buildingid.Value; }); diff --git a/sources/Tests/RunLocal/Program.cs b/sources/Tests/RunLocal/Program.cs index 3efb76c1..9e84f38f 100644 --- a/sources/Tests/RunLocal/Program.cs +++ b/sources/Tests/RunLocal/Program.cs @@ -27,7 +27,7 @@ static void Main(string[] args) Console.WriteLine($"{Directory.GetCurrentDirectory()}"); - Process(EtlLibrary.CreateVendorInstance(args[5], args[0]), int.Parse(args[1]), int.Parse(args[2]), args[3], bool.Parse(args[4]), args[5]); + Process(EtlLibrary.CreateVendorInstance(args[0], args[5]), int.Parse(args[1]), int.Parse(args[2]), args[3], bool.Parse(args[4]), args[5]); Console.WriteLine("DONE"); Console.ReadLine(); From b6ae4122ea249acfe7b0e5b6391863955bc3ba8d Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 13:00:17 
+0200 Subject: [PATCH 15/37] Prioritize external library over internal --- .../org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index 8f807c7c..f9c02f8a 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -115,7 +115,7 @@ public static Vendor CreateVendorInstance(string name, string? etlLibraryPath = { var currentAssemblies = GetETLAssemblies(Directory.GetCurrentDirectory()); var externalAssemblies = GetETLAssemblies(etlLibraryPath); - var allAssemblies = currentAssemblies.Union(externalAssemblies); + var allAssemblies = externalAssemblies.Union(currentAssemblies); //type from external library will be the first var vendorTypes = allAssemblies .SelectMany(s => s.GetTypes() .Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract)); From c9d3225097984d123f293a6e5d473326153a45b5 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 13:36:42 +0200 Subject: [PATCH 16/37] Fix etlLibrary --- .../Common/Utility/EtlLibrary.cs | 121 ++++++++++-------- 1 file changed, 69 insertions(+), 52 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index f9c02f8a..0cd4018d 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -3,6 +3,7 @@ using org.ohdsi.cdm.framework.common.Enums; using org.ohdsi.cdm.framework.common.Extensions; using org.ohdsi.cdm.framework.Common.Base; +using System.Linq; using System.Reflection; namespace org.ohdsi.cdm.framework.common.Utility @@ -15,13 +16,76 @@ public class Resource(string name, string 
value) public static class EtlLibrary { - private static IEnumerable GetETLAssemblies(string? path) + public static Vendor CreateVendorInstance(string name, string? etlLibraryPath = null) { - if (!string.IsNullOrEmpty(path)) - foreach (var assemblyFile in Directory.GetFiles(path, "*.dll")) + var allAssemblies = new List(); + allAssemblies.AddRange(GetETLAssemblies(etlLibraryPath)); + allAssemblies.AddRange(GetETLAssemblies(Directory.GetCurrentDirectory())); + var vendorTypes = allAssemblies + .Where(s => !s.FullName!.ToLower().Contains("System") + && !s.FullName!.ToLower().Contains("sql")) + .SelectMany(s => s.GetTypes() + .Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract)) + .ToList(); + //type from external library will be the first + var vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); + + if (vendorType == null) + { + name = name.ToLower().Replace("v5", "").Replace("full", ""); + + vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); + } + + if (vendorType != null) + { + Console.WriteLine("CreateVendorInstance | source path: " + etlLibraryPath); + Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); + Console.WriteLine(); + + return Activator.CreateInstance(vendorType) as Vendor; + } + + throw new KeyNotFoundException($"CreateVendorInstance | Vendor: {name}; LibraryPath: {etlLibraryPath} - not exists"); + } + + public static ConstructorInfo GetBuilderConstructor(string etlLibraryPath, Vendor vendor) + { + foreach (var assembly in GetETLAssemblies(etlLibraryPath)) + { + var builderTypes = assembly.GetTypes(). 
+ Where(t => t.IsSubclassOf(typeof(PersonBuilder)) && !t.IsAbstract); + + var vendorTypePersonBuilder = builderTypes.FirstOrDefault(a => NormalizeVendorName(a.Name).Contains(NormalizeVendorName(vendor.Name), StringComparison.CurrentCultureIgnoreCase)); + + if (vendorTypePersonBuilder != null) { - yield return Assembly.LoadFile(assemblyFile); + return vendorTypePersonBuilder.GetConstructor([typeof(Vendor)]) ?? throw new InvalidOperationException($"No suitable constructor found for type {vendorTypePersonBuilder.Name}"); } + } + + throw new KeyNotFoundException($"GetBuilderConstructor | Vendor: {vendor}; LibraryPath: {etlLibraryPath} - not exists"); + } + + private static List GetETLAssemblies(string? path) + { + List assemblies = new List(); + if (!string.IsNullOrEmpty(path)) + { + var files = Directory.GetFiles(path, "*.dll"); + foreach (var file in files) + try + { + var assembly = Assembly.LoadFrom(file); + assemblies.Add(assembly); + } + catch (Exception e) + { + Console.WriteLine($"Failed to extract assebly from file {file}!"); + Console.WriteLine(e.Message); + } + } + return assemblies; } private static IEnumerable> FindAssemblyAndResource(string etlLibraryPath, string name) @@ -109,53 +173,6 @@ public static void LoadVendorSettings(string etlLibraryPath, IVendorSettings set ld.FileName = lookup.Name; settings.CombinedLookupDefinitions.Add(ld); } - } - - public static Vendor CreateVendorInstance(string name, string? 
etlLibraryPath = null) - { - var currentAssemblies = GetETLAssemblies(Directory.GetCurrentDirectory()); - var externalAssemblies = GetETLAssemblies(etlLibraryPath); - var allAssemblies = externalAssemblies.Union(currentAssemblies); //type from external library will be the first - var vendorTypes = allAssemblies - .SelectMany(s => s.GetTypes() - .Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract)); - var vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); - - if (vendorType == null) - { - name = name.ToLower().Replace("v5", "").Replace("full", ""); - - vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); - } - - if (vendorType != null) - { - Console.WriteLine("CreateVendorInstance | source path: " + etlLibraryPath); - Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); - Console.WriteLine(); - - return Activator.CreateInstance(vendorType) as Vendor; - } - - throw new KeyNotFoundException($"CreateVendorInstance | Vendor: {name}; LibraryPath: {etlLibraryPath} - not exists"); - } - - public static ConstructorInfo GetBuilderConstructor(string etlLibraryPath, Vendor vendor) - { - foreach (var assembly in GetETLAssemblies(etlLibraryPath)) - { - var builderTypes = assembly.GetTypes(). - Where(t => t.IsSubclassOf(typeof(PersonBuilder)) && !t.IsAbstract); - - var vendorTypePersonBuilder = builderTypes.FirstOrDefault(a => NormalizeVendorName(a.Name).Contains(NormalizeVendorName(vendor.Name), StringComparison.CurrentCultureIgnoreCase)); - - if (vendorTypePersonBuilder != null) - { - return vendorTypePersonBuilder.GetConstructor([typeof(Vendor)]) ?? 
throw new InvalidOperationException($"No suitable constructor found for type {vendorTypePersonBuilder.Name}"); - } - } - - throw new KeyNotFoundException($"GetBuilderConstructor | Vendor: {vendor}; LibraryPath: {etlLibraryPath} - not exists"); - } + } } } \ No newline at end of file From 2b2736e24958f26883a5b86d94adebfd963798f8 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 13:54:35 +0200 Subject: [PATCH 17/37] revert recent EtlLibrary changes --- .../Common/Utility/EtlLibrary.cs | 117 ++++++++---------- 1 file changed, 49 insertions(+), 68 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index 0cd4018d..8cf2f3b3 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -16,76 +16,12 @@ public class Resource(string name, string value) public static class EtlLibrary { - public static Vendor CreateVendorInstance(string name, string? 
etlLibraryPath = null) + private static IEnumerable GetETLAssemblies(string path) { - var allAssemblies = new List(); - allAssemblies.AddRange(GetETLAssemblies(etlLibraryPath)); - allAssemblies.AddRange(GetETLAssemblies(Directory.GetCurrentDirectory())); - var vendorTypes = allAssemblies - .Where(s => !s.FullName!.ToLower().Contains("System") - && !s.FullName!.ToLower().Contains("sql")) - .SelectMany(s => s.GetTypes() - .Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract)) - .ToList(); - //type from external library will be the first - var vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); - - if (vendorType == null) + foreach (var assemblyFile in Directory.GetFiles(path, "*.dll")) { - name = name.ToLower().Replace("v5", "").Replace("full", ""); - - vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); - } - - if (vendorType != null) - { - Console.WriteLine("CreateVendorInstance | source path: " + etlLibraryPath); - Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); - Console.WriteLine(); - - return Activator.CreateInstance(vendorType) as Vendor; - } - - throw new KeyNotFoundException($"CreateVendorInstance | Vendor: {name}; LibraryPath: {etlLibraryPath} - not exists"); - } - - public static ConstructorInfo GetBuilderConstructor(string etlLibraryPath, Vendor vendor) - { - foreach (var assembly in GetETLAssemblies(etlLibraryPath)) - { - var builderTypes = assembly.GetTypes(). - Where(t => t.IsSubclassOf(typeof(PersonBuilder)) && !t.IsAbstract); - - var vendorTypePersonBuilder = builderTypes.FirstOrDefault(a => NormalizeVendorName(a.Name).Contains(NormalizeVendorName(vendor.Name), StringComparison.CurrentCultureIgnoreCase)); - - if (vendorTypePersonBuilder != null) - { - return vendorTypePersonBuilder.GetConstructor([typeof(Vendor)]) ?? 
throw new InvalidOperationException($"No suitable constructor found for type {vendorTypePersonBuilder.Name}"); - } - } - - throw new KeyNotFoundException($"GetBuilderConstructor | Vendor: {vendor}; LibraryPath: {etlLibraryPath} - not exists"); - } - - private static List GetETLAssemblies(string? path) - { - List assemblies = new List(); - if (!string.IsNullOrEmpty(path)) - { - var files = Directory.GetFiles(path, "*.dll"); - foreach (var file in files) - try - { - var assembly = Assembly.LoadFrom(file); - assemblies.Add(assembly); - } - catch (Exception e) - { - Console.WriteLine($"Failed to extract assebly from file {file}!"); - Console.WriteLine(e.Message); - } + yield return Assembly.LoadFile(assemblyFile); } - return assemblies; } private static IEnumerable> FindAssemblyAndResource(string etlLibraryPath, string name) @@ -173,6 +109,51 @@ public static void LoadVendorSettings(string etlLibraryPath, IVendorSettings set ld.FileName = lookup.Name; settings.CombinedLookupDefinitions.Add(ld); } - } + } + + public static Vendor CreateVendorInstance(string etlLibraryPath, string name) + { + foreach (var assembly in GetETLAssemblies(etlLibraryPath)) + { + var vendorTypes = assembly.GetTypes().Where(t => t.IsSubclassOf(typeof(Vendor)) && !t.IsAbstract); + var vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); + + if (vendorType == null) + { + name = name.ToLower().Replace("v5", "").Replace("full", ""); + + vendorType = vendorTypes.FirstOrDefault(a => a.Name.Contains(name, StringComparison.CurrentCultureIgnoreCase)); + } + + if (vendorType != null) + { + Console.WriteLine("CreateVendorInstance | assembly: " + assembly.GetName().Name); + Console.WriteLine("CreateVendorInstance | vendorType: " + vendorType); + Console.WriteLine(); + + return Activator.CreateInstance(vendorType) as Vendor; + } + } + + throw new KeyNotFoundException($"CreateVendorInstance | Vendor: {name}; LibraryPath: {etlLibraryPath} - not 
exists"); + } + + public static ConstructorInfo GetBuilderConstructor(string etlLibraryPath, Vendor vendor) + { + foreach (var assembly in GetETLAssemblies(etlLibraryPath)) + { + var builderTypes = assembly.GetTypes(). + Where(t => t.IsSubclassOf(typeof(PersonBuilder)) && !t.IsAbstract); + + var vendorTypePersonBuilder = builderTypes.FirstOrDefault(a => NormalizeVendorName(a.Name).Contains(NormalizeVendorName(vendor.Name), StringComparison.CurrentCultureIgnoreCase)); + + if (vendorTypePersonBuilder != null) + { + return vendorTypePersonBuilder.GetConstructor([typeof(Vendor)]) ?? throw new InvalidOperationException($"No suitable constructor found for type {vendorTypePersonBuilder.Name}"); + } + } + + throw new KeyNotFoundException($"GetBuilderConstructor | Vendor: {vendor}; LibraryPath: {etlLibraryPath} - not exists"); + } } } \ No newline at end of file From e2f3cf152751c429594417af19665aa7cca87258 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 14:07:47 +0200 Subject: [PATCH 18/37] Fix --- .../org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index 8cf2f3b3..d69e3cd3 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -111,7 +111,7 @@ public static void LoadVendorSettings(string etlLibraryPath, IVendorSettings set } } - public static Vendor CreateVendorInstance(string etlLibraryPath, string name) + public static Vendor CreateVendorInstance(string name, string etlLibraryPath) { foreach (var assembly in GetETLAssemblies(etlLibraryPath)) { From 2bc5c9028d316cf6197c536a05938ea1f13ea812 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 14:15:00 +0200 Subject: [PATCH 19/37] Reorder settings display --- 
sources/RunValidation/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 4d716fbc..7f06b478 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -79,9 +79,9 @@ static void RunWithOptions(Options opts) Console.WriteLine($"Vendor: {opts.Vendor}"); Console.WriteLine($"Building ID: {opts.BuildingId}"); + Console.WriteLine($"ChunkSlicePairs: {string.Join(", ", chunkSlicePairsStrings)}"); Console.WriteLine($"EtlLibraryPath: {opts.EtlLibraryPath}"); Console.WriteLine($"LocalTmpPath: {opts.LocalTmpPath}"); - Console.WriteLine($"ChunkSlicePairs: {string.Join(", ", chunkSlicePairsStrings)}"); Console.WriteLine($"Current directory: {Directory.GetCurrentDirectory()}"); Console.WriteLine(); From 55740fb969965b7a2f4bd5839af2a1e9153a2986 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 15:55:34 +0200 Subject: [PATCH 20/37] Remove specifying slices to process. Process every slice present --- sources/RunValidation/Program.cs | 25 +++++-------------------- sources/RunValidation/Validation.cs | 25 ++++--------------------- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 7f06b478..6613064e 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -23,7 +23,7 @@ internal class Options public string LocalTmpPath { get; set; } = "C:\\_tmp"; [Option('c', "chunks", Separator = ',', HelpText = "(Optional) Comma-separated list of chunk IDs to process. 
All of them, if omitted.")] - public IEnumerable ChunkSlicePairs { get; set; } = new List(); + public IEnumerable Chunks { get; set; } = new List(); [Usage(ApplicationAlias = "RunValidation")] public static IEnumerable Examples @@ -35,9 +35,7 @@ public static IEnumerable Examples yield return new Example("Process all chunks for an external .dll", new Options { Vendor = "ExternalVendorName", BuildingId = 123, EtlLibraryPath = "C:\\PathToExternalDllFolder"}); yield return new Example("Process specified chunks", new Options - { Vendor = "VendorName", BuildingId = 123, ChunkSlicePairs = new List { "1", "2", "3" } }); - yield return new Example("Process specified pairs chunk:slice. If : omitted, then process all slices for a given chunk", new Options - { Vendor = "VendorName", BuildingId = 123, ChunkSlicePairs = new List { "1", "2:1", "2:2", "3" } }); + { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1, 2, 3 } }); } } } @@ -58,20 +56,7 @@ static void Main(string[] args) static void RunWithOptions(Options opts) { - var chunkSlicePairs = new List<(int Chunk, int? Slice)>(); - foreach (var raw in opts.ChunkSlicePairs) - { - var parts = raw.Replace(" ", "").Split(':'); - - int chunkId = int.Parse(parts[0]); - - int? sliceId = null; - if (parts.Length > 1 && int.TryParse(parts[1], out int sliceIdTmp)) - sliceId = sliceIdTmp; - - chunkSlicePairs.Add((chunkId, sliceId)); - } - var chunkSlicePairsStrings = chunkSlicePairs.Select(s => s.Chunk + (s.Slice.HasValue ? ":" + s.Slice : "")).ToList(); + var chunks = opts.Chunks.ToList() ?? 
new List(); Console.WriteLine("Options:"); //Console.WriteLine($"Keys: {_awsAccessKeyId} - {_awsSecretAccessKey}"); @@ -79,7 +64,7 @@ static void RunWithOptions(Options opts) Console.WriteLine($"Vendor: {opts.Vendor}"); Console.WriteLine($"Building ID: {opts.BuildingId}"); - Console.WriteLine($"ChunkSlicePairs: {string.Join(", ", chunkSlicePairsStrings)}"); + Console.WriteLine($"Chunks: {string.Join(", ", chunks)}"); Console.WriteLine($"EtlLibraryPath: {opts.EtlLibraryPath}"); Console.WriteLine($"LocalTmpPath: {opts.LocalTmpPath}"); Console.WriteLine($"Current directory: {Directory.GetCurrentDirectory()}"); @@ -87,7 +72,7 @@ static void RunWithOptions(Options opts) Vendor vendor = EtlLibrary.CreateVendorInstance(opts.Vendor, opts.EtlLibraryPath); var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, opts.LocalTmpPath, _cdmFolder); - validation.ValidateBuildingId(vendor, opts.BuildingId, chunkSlicePairs); + validation.ValidateBuildingId(vendor, opts.BuildingId, chunks); } static void HandleParseError(IEnumerable errs) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 0b5f82bb..fd905b3e 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -31,19 +31,19 @@ public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string #region Methods - public void ValidateBuildingId(Vendor vendor, int buildingId, List<(int ChunkId, int? 
SliceId)> chunkSlicePairs) + public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) { var _wrong = new List(); var timer = new Stopwatch(); timer.Start(); - var actualSlices = GetActualSlices(vendor.Name, buildingId); + var actualSlices = GetActualSlices(vendor.Name, buildingId).ToList().OrderBy(s => s).ToList(); foreach (var awsChunk in GetChunks(vendor, buildingId)) { var awsChunkId = awsChunk.Key; - if (chunkSlicePairs.Any() && !chunkSlicePairs.Any(s => s.ChunkId == awsChunkId)) + if (chunks.Any() && !chunks.Any(s => s == awsChunkId)) { Console.WriteLine(); Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} is skipped"); @@ -52,24 +52,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List<(int ChunkId, Console.WriteLine(); Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} validation start"); - var slices = chunkSlicePairs - .Where(s => s.ChunkId == awsChunkId) - .Where(s => s.SliceId != null) - .Select(s => s.SliceId ?? -1) //change type int? to int - .ToList(); - - var slices2process = (!slices.Any()) - ? 
actualSlices - .OrderBy(s => s) - .ToList() - : slices - .Distinct() - .Where(s => actualSlices.Any(a => a == s)) - .OrderBy(s => s) - .ToList() - ; - - ValidateChunkId(vendor, buildingId, awsChunkId, slices2process); + ValidateChunkId(vendor, buildingId, awsChunkId, actualSlices); Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} is validated"); } From c68e6a237e984e26bcbfedcd082fc6bef31a2140 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 17:04:41 +0200 Subject: [PATCH 21/37] Add smooth progress bars for chunk processing --- sources/RunValidation/RunValidation.csproj | 1 + sources/RunValidation/Validation.cs | 113 +++++++++++++++------ 2 files changed, 85 insertions(+), 29 deletions(-) diff --git a/sources/RunValidation/RunValidation.csproj b/sources/RunValidation/RunValidation.csproj index 69bd35ab..54099441 100644 --- a/sources/RunValidation/RunValidation.csproj +++ b/sources/RunValidation/RunValidation.csproj @@ -9,6 +9,7 @@ + diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index fd905b3e..1d631000 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -10,6 +10,7 @@ using System.IO.Compression; using System.Text; using ZstdSharp; +using Spectre.Console; namespace RunValidation { @@ -37,48 +38,48 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) var timer = new Stopwatch(); timer.Start(); - var actualSlices = GetActualSlices(vendor.Name, buildingId).ToList().OrderBy(s => s).ToList(); + var actualSlices = GetActualSlices(vendor.Name, buildingId).OrderBy(s => s).ToList(); - foreach (var awsChunk in GetChunks(vendor, buildingId)) - { - var awsChunkId = awsChunk.Key; + int chunkCount = 0; + var actualChunks = new List>>>(); - if (chunks.Any() && !chunks.Any(s => s == awsChunkId)) + AnsiConsole.Status() + .Spinner(Spinner.Known.Dots) + .Start("Getting all chunks...", ctx => { - Console.WriteLine(); - 
Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} is skipped"); - continue; - } - Console.WriteLine(); - Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} validation start"); + var progress = new Progress(count => + { + chunkCount = count; + ctx.Status($"Getting all chunks... (Chunks obtained: {chunkCount})"); + }); + + foreach (var chunk in GetChunks(vendor, buildingId, progress)) + { + actualChunks.Add(chunk); + } + }); - ValidateChunkId(vendor, buildingId, awsChunkId, actualSlices); + timer.Stop(); + AnsiConsole.MarkupLine($"[green]Getting all {actualChunks.Count} chunks done. It took {timer.ElapsedMilliseconds / 1000}s[/]"); + timer.Restart(); - Console.WriteLine($"BuildingId {buildingId} ChunkId {awsChunkId} is validated"); - } + ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices, actualChunks); - Console.WriteLine(); timer.Stop(); - Console.WriteLine($"Done. Total seconds={timer.ElapsedMilliseconds/1000}s"); + AnsiConsole.MarkupLine($"[bold]Done. 
Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); timer.Restart(); } - private void ValidateChunkId(Vendor vendor, int buildingId, int chunkId, List slices) - { - var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); - Parallel.ForEach(s3ObjectsBySlice, slice => - { - ValidateSliceId(vendor, buildingId, chunkId, slice.Key, slice.Value.personObjects, slice.Value.metadataObjects); - }); - } - private IEnumerable>>> GetChunks(Vendor vendor, int buildingId) + private IEnumerable>>> GetChunks(Vendor vendor, int buildingId, IProgress progress) { var currentChunkId = 0; - var result = new KeyValuePair>>(0, []); + var result = new KeyValuePair>>(0, new Dictionary>()); var prefix = $"{vendor}/{buildingId}/_chunks"; + int chunkCount = 0; + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) { var request = new ListObjectsV2Request @@ -105,15 +106,18 @@ private IEnumerable>>> GetChunks if (currentChunkId != chunkId) { if (result.Value.Count > 0) + { yield return result; + chunkCount++; + progress?.Report(chunkCount); + } - result = new KeyValuePair>>(chunkId, - []); + result = new KeyValuePair>>(chunkId, new Dictionary>()); currentChunkId = chunkId; } var personId = long.Parse(line.Split('\t')[1]); - result.Value.Add(personId, []); + result.Value.Add(personId, new List()); line = reader.ReadLine(); } @@ -121,9 +125,60 @@ private IEnumerable>>> GetChunks } if (result.Value.Count > 0) + { yield return result; + chunkCount++; + progress?.Report(chunkCount); + } + } + + + + private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices, List>>> actualChunks) + { + AnsiConsole.Progress() + .AutoClear(false) + .HideCompleted(false) + .Columns( + new TaskDescriptionColumn(), + new ProgressBarColumn(), + new PercentageColumn(), + new RemainingTimeColumn(), + new SpinnerColumn()) + .Start(ctx => + { + foreach (var awsChunk in actualChunks) + { + var 
awsChunkId = awsChunk.Key; + + if (chunks.Any() && !chunks.Contains(awsChunkId)) + { + AnsiConsole.MarkupLine($"[yellow]BuildingId {buildingId} ChunkId {awsChunkId} is skipped[/]"); + continue; + } + + + var task = ctx.AddTask($"Chunk {awsChunkId}", maxValue: actualSlices.Count); + + ValidateChunkIdWithProgress(vendor, buildingId, awsChunkId, actualSlices, task); + } + }); } + + private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, List slices, ProgressTask task) + { + var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); + + foreach (var slice in s3ObjectsBySlice) + { + ValidateSliceId(vendor, buildingId, chunkId, slice.Key, slice.Value.personObjects, slice.Value.metadataObjects); + + task.Increment(1); + } + } + + private HashSet GetActualSlices(string vendorName, int buildingId) { var slices = new HashSet(); From 52360a5489273f3a8796e28969a0bc54726babf4 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 24 Oct 2024 17:32:49 +0200 Subject: [PATCH 22/37] Add parallel execution for chunks --- sources/RunValidation/Validation.cs | 30 ++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 1d631000..9ea3cdb8 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -11,6 +11,7 @@ using System.Text; using ZstdSharp; using Spectre.Console; +using System.Collections.Concurrent; namespace RunValidation { @@ -136,6 +137,8 @@ private IEnumerable>>> GetChunks private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices, List>>> actualChunks) { + AnsiConsole.MarkupLine("\n"); + AnsiConsole.Progress() .AutoClear(false) .HideCompleted(false) @@ -147,32 +150,45 @@ private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List new SpinnerColumn()) .Start(ctx => { + var tasks = new ConcurrentDictionary(); + foreach 
(var awsChunk in actualChunks) { var awsChunkId = awsChunk.Key; if (chunks.Any() && !chunks.Contains(awsChunkId)) { - AnsiConsole.MarkupLine($"[yellow]BuildingId {buildingId} ChunkId {awsChunkId} is skipped[/]"); continue; } - var task = ctx.AddTask($"Chunk {awsChunkId}", maxValue: actualSlices.Count); - - ValidateChunkIdWithProgress(vendor, buildingId, awsChunkId, actualSlices, task); + tasks[awsChunkId] = task; } + + Parallel.ForEach(actualChunks + , new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount == 1 ? 1 : Environment.ProcessorCount - 1 } // leave 1 core for UI and OS + , awsChunk => + { + var awsChunkId = awsChunk.Key; + var awsChunkPersonIds = awsChunk.Value.Keys.ToHashSet(); + + if (tasks.TryGetValue(awsChunkId, out var task)) + { + ValidateChunkIdWithProgress(vendor, buildingId, awsChunkId, awsChunkPersonIds, actualSlices, task); + } + }); }); } - private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, List slices, ProgressTask task) + + private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, HashSet chunkPersonIds, List slices, ProgressTask task) { var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); foreach (var slice in s3ObjectsBySlice) { - ValidateSliceId(vendor, buildingId, chunkId, slice.Key, slice.Value.personObjects, slice.Value.metadataObjects); + ValidateSliceId(vendor, buildingId, chunkId, slice.Key, chunkPersonIds, slice.Value.personObjects, slice.Value.metadataObjects); task.Increment(1); } @@ -273,7 +289,7 @@ private IEnumerable>> GetObjects(Vendor vendor, int bu } } - private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId, int sliceId, + private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId, int sliceId, HashSet chunkPersonIds, List personObjects, List metadataObjects) { var attempt = 0; From f41ad462e8ec42df0e23acf92f344f60e85b63c9 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 30 
Oct 2024 14:58:04 +0100 Subject: [PATCH 23/37] Add checking for personIds which are neither in Person or Metadata --- sources/RunValidation/Validation.cs | 59 ++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 9ea3cdb8..f6651f12 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -67,7 +67,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices, actualChunks); timer.Stop(); - AnsiConsole.MarkupLine($"[bold]Done. Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); + AnsiConsole.MarkupLine($"[green]Done. Problematic chunks, if any, are described above the chunk progress. Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); timer.Restart(); } @@ -182,16 +182,30 @@ private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List - private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, HashSet chunkPersonIds, List slices, ProgressTask task) + private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, HashSet chunkPersonIds, List slices, ProgressTask task) { var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); + var personIdsInBatchAndPersonOrMetadata = new HashSet(); + foreach (var slice in s3ObjectsBySlice) { - ValidateSliceId(vendor, buildingId, chunkId, slice.Key, chunkPersonIds, slice.Value.personObjects, slice.Value.metadataObjects); - + var slicePersonIdsInBatchAndPersonOrMetadata = ValidateSliceId(vendor, buildingId, chunkId, slice.Key, chunkPersonIds, slice.Value.personObjects, slice.Value.metadataObjects); + foreach (var slicePersonId in slicePersonIdsInBatchAndPersonOrMetadata) + personIdsInBatchAndPersonOrMetadata.Add(slicePersonId); task.Increment(1); } + + var inBatchOnlyPersonIds = chunkPersonIds + .Where(s 
=> !personIdsInBatchAndPersonOrMetadata.TryGetValue(s, out long actualValue)) + .ToHashSet(); + + if (inBatchOnlyPersonIds.Count > 0) + { + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyIdsCount={inBatchOnlyPersonIds.Count} | Id Example={inBatchOnlyPersonIds.First()}[/]"; + AnsiConsole.MarkupLine(msg); + } + } @@ -300,12 +314,12 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId try { attempt++; - var appearenceStatsByPersonId = new Dictionary(); - var timer = new Stopwatch(); timer.Start(); - #region get appearenceStatsByPersonId + #region var appearanceStatsByPersonId = new Dictionary(); + var appearanceStatsByPersonId = new Dictionary(); + var cnt = 0; var attempt1 = attempt; @@ -331,19 +345,19 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId while (csv.Read()) { var personId = (long)csv.GetField(typeof(long), 0); - lock (appearenceStatsByPersonId) + lock (appearanceStatsByPersonId) { - if (!appearenceStatsByPersonId.ContainsKey(personId)) - appearenceStatsByPersonId[personId] = (0, 0); + if (!appearanceStatsByPersonId.ContainsKey(personId)) + appearanceStatsByPersonId[personId] = (0, 0); - var tuple = appearenceStatsByPersonId[personId]; + var tuple = appearanceStatsByPersonId[personId]; if (o.Key.Contains("PERSON")) tuple.InPersonCount++; else if (o.Key.Contains("METADATA_TMP")) tuple.InMetadataCount++; - appearenceStatsByPersonId[personId] = tuple; + appearanceStatsByPersonId[personId] = tuple; } } Interlocked.Increment(ref cnt); @@ -351,15 +365,25 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId #endregion + var appearanceStatsByChunkPersonId = chunkPersonIds + .Select(s => appearanceStatsByPersonId.ContainsKey(s) + ? 
KeyValuePair.Create(s, Tuple.Create(appearanceStatsByPersonId[s].InPersonCount, appearanceStatsByPersonId[s].InMetadataCount)) + : KeyValuePair.Create(s, Tuple.Create(0, 0))) + .ToDictionary(); + int wrongCount = 0; var dups = 0; + var personIdsInBatchAndPersonOrMetadata = new HashSet(); var wrongPersonIds = new HashSet(); - foreach (var kvp in appearenceStatsByPersonId) + foreach (var kvp in appearanceStatsByPersonId) { var personId = kvp.Key; var stats = kvp.Value; + if(stats.InPersonCount > 0 || stats.InMetadataCount > 0) + personIdsInBatchAndPersonOrMetadata.Add(personId); + //check InPersonCount just in case, InMetadataCount should actually suffice if (stats.InPersonCount > 1 || stats.InMetadataCount > 0) { @@ -377,16 +401,17 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId if (wrongCount > 0) { - var msg = $"--BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} | WrongCount={wrongCount}; Duplicates={dups} | Wrong Person Id Example={wrongPersonIds.First()}"; - Console.WriteLine(msg); + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} | WrongCount={wrongCount}; Duplicates={dups} | Wrong Person Id Example={wrongPersonIds.First()}[/]"; + AnsiConsole.MarkupLine(msg); } complete = true; - return wrongPersonIds; + return personIdsInBatchAndPersonOrMetadata; } catch (Exception ex) { - Console.Write("--" + ex.Message + " | [ProcessChunk] Exception | new attempt | attempt=" + attempt); + var msg = $"[red]{ex.Message} | ProcessChunk Exception | new attempt | attempt={attempt}[/]"; + AnsiConsole.MarkupLine(msg); if (attempt > 3) { throw; From e6d2c907fc5b0f0a5c88da8f95465e7c8ea959a5 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 30 Oct 2024 18:15:16 +0100 Subject: [PATCH 24/37] Memory management --- sources/RunValidation/Program.cs | 1 + sources/RunValidation/Validation.cs | 209 +++++++++++++++------------- 2 files changed, 115 insertions(+), 95 deletions(-) diff --git 
a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 6613064e..220682ef 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -51,6 +51,7 @@ static void Main(string[] args) .WithParsed(RunWithOptions) .WithNotParsed(HandleParseError); + GC.Collect(GC.MaxGeneration, GCCollectionMode.Aggressive, true, true); Console.ReadLine(); } diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index f6651f12..b92662fe 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -18,6 +18,37 @@ namespace RunValidation public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string bucket, string tmpFolder, string cdmFolder) { + #region classes + + public class PersonInS3Chunk(long PersonId, int ChunkId, bool IsFromBatch = true) + { + public long PersonId { get; set; } = PersonId; + public int ChunkId { get; set; } = ChunkId; + public bool IsFromBatch { get; set; } = IsFromBatch; + public int? SliceId { get; set; } + public int? InPersonFilesCount { get; set; } = 0; + public int? InMetadataFilesCount { get; set; } = 0; + + public override int GetHashCode() + { + return PersonId.GetHashCode(); //assumming that a single PersonId is never duplicated in a single ChunkId + } + + public override bool Equals(object? obj) + { + if (obj is not PersonInS3Chunk) + return false; + return ((PersonInS3Chunk)obj).GetHashCode() == this.GetHashCode(); + } + + public override string ToString() + { + return $"{ChunkId} - {SliceId?.ToString() ?? 
"???"} - {PersonId}"; + } + } + + #endregion + #region Fields private readonly string _awsAccessKeyId = awsAccessKeyId; @@ -42,7 +73,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) var actualSlices = GetActualSlices(vendor.Name, buildingId).OrderBy(s => s).ToList(); int chunkCount = 0; - var actualChunks = new List>>>(); + var personsByChunkId = new Dictionary>(); AnsiConsole.Status() .Spinner(Spinner.Known.Dots) @@ -54,17 +85,14 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) ctx.Status($"Getting all chunks... (Chunks obtained: {chunkCount})"); }); - foreach (var chunk in GetChunks(vendor, buildingId, progress)) - { - actualChunks.Add(chunk); - } + personsByChunkId = GetPersonsByChunkId(vendor, buildingId, chunks, progress); }); timer.Stop(); - AnsiConsole.MarkupLine($"[green]Getting all {actualChunks.Count} chunks done. It took {timer.ElapsedMilliseconds / 1000}s[/]"); + AnsiConsole.MarkupLine($"[green]Getting all {personsByChunkId.Keys.Count} chunks done. It took {timer.ElapsedMilliseconds / 1000}s[/]"); timer.Restart(); - ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices, actualChunks); + ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices, personsByChunkId); timer.Stop(); AnsiConsole.MarkupLine($"[green]Done. Problematic chunks, if any, are described above the chunk progress. 
Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); @@ -74,12 +102,13 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) - private IEnumerable>>> GetChunks(Vendor vendor, int buildingId, IProgress progress) - { - var currentChunkId = 0; - var result = new KeyValuePair>>(0, new Dictionary>()); + private Dictionary> GetPersonsByChunkId(Vendor vendor, int buildingId, List chunks, IProgress progress) + { + var persons = new Dictionary>(); + var prefix = $"{vendor}/{buildingId}/_chunks"; int chunkCount = 0; + int previousChunk = -1; using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) { @@ -97,45 +126,42 @@ private IEnumerable>>> GetChunks using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); using var responseStream = transferUtility.OpenStream(_bucket, o.Key); using var bufferedStream = new BufferedStream(responseStream); - using var gzipStream = new GZipStream(bufferedStream, CompressionMode.Decompress); - using var reader = new StreamReader(gzipStream, Encoding.Default); + using Stream compressedStream = o.Key.EndsWith(".gz") + ? new GZipStream(bufferedStream, CompressionMode.Decompress) + : new DecompressionStream(bufferedStream) //.zst + ; + using var reader = new StreamReader(compressedStream, Encoding.Default); string? 
line = reader.ReadLine(); while (!string.IsNullOrEmpty(line)) { - var chunkId = int.Parse(line.Split('\t')[0]); + var splits = line.Split('\t'); + var chunkId = int.Parse(splits[0]); + var personId = long.Parse(splits[1]); - if (currentChunkId != chunkId) + if (chunkId != previousChunk) { - if (result.Value.Count > 0) - { - yield return result; - chunkCount++; - progress?.Report(chunkCount); - } - - result = new KeyValuePair>>(chunkId, new Dictionary>()); - currentChunkId = chunkId; + previousChunk = chunkId; + chunkCount++; + progress?.Report(chunkCount); } + if (chunks.Any() && !chunks.Any(s => s == chunkId)) + break; - var personId = long.Parse(line.Split('\t')[1]); - result.Value.Add(personId, new List()); + if (!persons.ContainsKey(chunkId)) + persons.Add(chunkId, new HashSet()); + persons[chunkId].Add(new PersonInS3Chunk(personId, chunkId)); line = reader.ReadLine(); } } } - if (result.Value.Count > 0) - { - yield return result; - chunkCount++; - progress?.Report(chunkCount); - } + return persons; } - private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices, List>>> actualChunks) + private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices, Dictionary> personsByChunkId) { AnsiConsole.MarkupLine("\n"); @@ -152,10 +178,8 @@ private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List { var tasks = new ConcurrentDictionary(); - foreach (var awsChunk in actualChunks) + foreach (var awsChunkId in personsByChunkId.Keys) { - var awsChunkId = awsChunk.Key; - if (chunks.Any() && !chunks.Contains(awsChunkId)) { continue; @@ -165,44 +189,47 @@ private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List tasks[awsChunkId] = task; } - Parallel.ForEach(actualChunks - , new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount == 1 ? 
1 : Environment.ProcessorCount - 1 } // leave 1 core for UI and OS - , awsChunk => + Parallel.ForEach(personsByChunkId.Keys.OrderBy(s => s) + , new ParallelOptions {MaxDegreeOfParallelism = Environment.ProcessorCount == 1 ? 1 : Environment.ProcessorCount - 1 } // leave 1 core for UI and OS + , awsChunkId => { - var awsChunkId = awsChunk.Key; - var awsChunkPersonIds = awsChunk.Value.Keys.ToHashSet(); + var awsChunkPersonIds = personsByChunkId[awsChunkId]; if (tasks.TryGetValue(awsChunkId, out var task)) { - ValidateChunkIdWithProgress(vendor, buildingId, awsChunkId, awsChunkPersonIds, actualSlices, task); + ValidateChunkIdWithProgress(vendor, buildingId, awsChunkPersonIds, actualSlices, task); } + + personsByChunkId[awsChunkId].Clear(); + awsChunkPersonIds = null; + GC.Collect(); }); + GC.KeepAlive(personsByChunkId); }); } - private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, HashSet chunkPersonIds, List slices, ProgressTask task) + private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, HashSet chunkPersonIds, List slices, ProgressTask task) { - var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); + var chunkId = chunkPersonIds.First().ChunkId; - var personIdsInBatchAndPersonOrMetadata = new HashSet(); + var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); foreach (var slice in s3ObjectsBySlice) { - var slicePersonIdsInBatchAndPersonOrMetadata = ValidateSliceId(vendor, buildingId, chunkId, slice.Key, chunkPersonIds, slice.Value.personObjects, slice.Value.metadataObjects); - foreach (var slicePersonId in slicePersonIdsInBatchAndPersonOrMetadata) - personIdsInBatchAndPersonOrMetadata.Add(slicePersonId); + var slicePersonIds = ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.personObjects, slice.Value.metadataObjects); task.Increment(1); } var inBatchOnlyPersonIds = chunkPersonIds - .Where(s => 
!personIdsInBatchAndPersonOrMetadata.TryGetValue(s, out long actualValue)) + .Where(s => s.InMetadataFilesCount + s.InPersonFilesCount == 0) .ToHashSet(); + if (inBatchOnlyPersonIds.Count > 0) { - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyIdsCount={inBatchOnlyPersonIds.Count} | Id Example={inBatchOnlyPersonIds.First()}[/]"; + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersonIds.Count} | Id Example={inBatchOnlyPersonIds.First().PersonId}[/]"; AnsiConsole.MarkupLine(msg); } @@ -303,7 +330,18 @@ private IEnumerable>> GetObjects(Vendor vendor, int bu } } - private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId, int sliceId, HashSet chunkPersonIds, + /// + /// This method alters members of chunkPersonIds collection + /// + /// + /// + /// + /// + /// + /// + /// + /// Subset of chunkPersonIds for the specified sliceId + private HashSet ValidateSliceId(HashSet chunkPersonIds, Vendor vendor, int buildingId, int chunkId, int sliceId, List personObjects, List metadataObjects) { var attempt = 0; @@ -317,9 +355,7 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId var timer = new Stopwatch(); timer.Start(); - #region var appearanceStatsByPersonId = new Dictionary(); - var appearanceStatsByPersonId = new Dictionary(); - + #region chunkPersonIds -> set counts var cnt = 0; var attempt1 = attempt; @@ -345,19 +381,20 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId while (csv.Read()) { var personId = (long)csv.GetField(typeof(long), 0); - lock (appearanceStatsByPersonId) + lock (chunkPersonIds) { - if (!appearanceStatsByPersonId.ContainsKey(personId)) - appearanceStatsByPersonId[personId] = (0, 0); + var localPersonInS3 = new PersonInS3Chunk(personId, chunkId, false); + if (chunkPersonIds.TryGetValue(localPersonInS3, out var actual)) + localPersonInS3 = actual; + else + chunkPersonIds.Add(localPersonInS3); - var tuple = 
appearanceStatsByPersonId[personId]; + localPersonInS3.SliceId ??= sliceId; if (o.Key.Contains("PERSON")) - tuple.InPersonCount++; + localPersonInS3.InPersonFilesCount++; else if (o.Key.Contains("METADATA_TMP")) - tuple.InMetadataCount++; - - appearanceStatsByPersonId[personId] = tuple; + localPersonInS3.InMetadataFilesCount++; } } Interlocked.Increment(ref cnt); @@ -365,48 +402,30 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId #endregion - var appearanceStatsByChunkPersonId = chunkPersonIds - .Select(s => appearanceStatsByPersonId.ContainsKey(s) - ? KeyValuePair.Create(s, Tuple.Create(appearanceStatsByPersonId[s].InPersonCount, appearanceStatsByPersonId[s].InMetadataCount)) - : KeyValuePair.Create(s, Tuple.Create(0, 0))) - .ToDictionary(); - - int wrongCount = 0; - var dups = 0; - var personIdsInBatchAndPersonOrMetadata = new HashSet(); - var wrongPersonIds = new HashSet(); - - foreach (var kvp in appearanceStatsByPersonId) - { - var personId = kvp.Key; - var stats = kvp.Value; + var slicePersonIds = chunkPersonIds + .Where(s => s.SliceId == sliceId) + .ToHashSet(); - if(stats.InPersonCount > 0 || stats.InMetadataCount > 0) - personIdsInBatchAndPersonOrMetadata.Add(personId); + var slicePersonIdsDuplicated = slicePersonIds + .Where(s => s.InPersonFilesCount + s.InMetadataFilesCount > 1) + .ToHashSet(); - //check InPersonCount just in case, InMetadataCount should actually suffice - if (stats.InPersonCount > 1 || stats.InMetadataCount > 0) - { - wrongCount++; - - if (stats.InPersonCount > 1 || stats.InMetadataCount > 1) - dups++; - - if(!wrongPersonIds.Contains(personId)) - wrongPersonIds.Add(personId); - } - } + var slicePersonIdsWrongCount = slicePersonIds + .Where(s => s.InPersonFilesCount != 1 || s.InMetadataFilesCount != 0 || !s.IsFromBatch) + .ToHashSet(); timer.Stop(); - if (wrongCount > 0) + if (slicePersonIdsWrongCount.Count > 0) { - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} | 
WrongCount={wrongCount}; Duplicates={dups} | Wrong Person Id Example={wrongPersonIds.First()}[/]"; + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} " + + $"| WrongCount={slicePersonIdsWrongCount.Count}; Duplicates={slicePersonIdsDuplicated.Count} " + + $"| Wrong Person Id Example={slicePersonIdsWrongCount.First().PersonId}[/]"; AnsiConsole.MarkupLine(msg); } complete = true; - return personIdsInBatchAndPersonOrMetadata; + return slicePersonIds; } catch (Exception ex) { @@ -418,7 +437,7 @@ private HashSet ValidateSliceId(Vendor vendor, int buildingId, int chunkId } } } - return null; + return new HashSet(); } #endregion From 0cf7bf7f97baf8c8ace6d38d4d20f34be5a5752c Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 30 Oct 2024 18:57:10 +0100 Subject: [PATCH 25/37] Output 1 SliceId to reload if PersonId is in chunk, but not in Person or Metadata --- sources/RunValidation/Validation.cs | 139 ++++++++++++++++++++++++++-- 1 file changed, 133 insertions(+), 6 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index b92662fe..a74df2ca 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -12,6 +12,7 @@ using ZstdSharp; using Spectre.Console; using System.Collections.Concurrent; +using System.IO; namespace RunValidation { @@ -20,9 +21,11 @@ public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string { #region classes - public class PersonInS3Chunk(long PersonId, int ChunkId, bool IsFromBatch = true) + public class PersonInS3Chunk(long PersonId, Vendor Vendor, int BuildingId, int ChunkId, bool IsFromBatch = true) { public long PersonId { get; set; } = PersonId; + public Vendor Vendor { get; set; } = Vendor; + public int BuildingId { get; set; } = BuildingId; public int ChunkId { get; set; } = ChunkId; public bool IsFromBatch { get; set; } = IsFromBatch; public int? 
SliceId { get; set; } @@ -31,7 +34,7 @@ public class PersonInS3Chunk(long PersonId, int ChunkId, bool IsFromBatch = true public override int GetHashCode() { - return PersonId.GetHashCode(); //assumming that a single PersonId is never duplicated in a single ChunkId + return Vendor.GetHashCode() ^ BuildingId.GetHashCode() ^ ChunkId.GetHashCode() ^ PersonId.GetHashCode(); //assumming that a single PersonId is never duplicated in a single ChunkId } public override bool Equals(object? obj) @@ -43,7 +46,7 @@ public override bool Equals(object? obj) public override string ToString() { - return $"{ChunkId} - {SliceId?.ToString() ?? "???"} - {PersonId}"; + return $"{Vendor} - {BuildingId} - {ChunkId} - {SliceId?.ToString() ?? "???"} - {PersonId}"; } } @@ -149,7 +152,7 @@ private Dictionary> GetPersonsByChunkId(Vendor ven if (!persons.ContainsKey(chunkId)) persons.Add(chunkId, new HashSet()); - persons[chunkId].Add(new PersonInS3Chunk(personId, chunkId)); + persons[chunkId].Add(new PersonInS3Chunk(personId, vendor, buildingId, chunkId)); line = reader.ReadLine(); } @@ -229,7 +232,11 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, HashSet if (inBatchOnlyPersonIds.Count > 0) { - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersonIds.Count} | Id Example={inBatchOnlyPersonIds.First().PersonId}[/]"; + var inBatchOnlyExample = inBatchOnlyPersonIds.First(); + inBatchOnlyExample.SliceId = FindSlice(inBatchOnlyExample, vendor.PersonTableName, vendor.PersonIdIndex); + + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersonIds.Count} " + + $"| Id Example={inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? 
"???"}[/]"; AnsiConsole.MarkupLine(msg); } @@ -383,7 +390,7 @@ private HashSet ValidateSliceId(HashSet chunkP var personId = (long)csv.GetField(typeof(long), 0); lock (chunkPersonIds) { - var localPersonInS3 = new PersonInS3Chunk(personId, chunkId, false); + var localPersonInS3 = new PersonInS3Chunk(personId, vendor, buildingId, chunkId, false); if (chunkPersonIds.TryGetValue(localPersonInS3, out var actual)) localPersonInS3 = actual; else @@ -440,6 +447,126 @@ private HashSet ValidateSliceId(HashSet chunkP return new HashSet(); } + /// + /// + /// + /// This should have Vendor, BuildingId, ChunkId, and PersonId information + /// + /// + private int? FindSlice(PersonInS3Chunk person, string table, int personIndex) + { + var prefix = $"{person.Vendor.Name}/{person.BuildingId}/raw/{person.ChunkId}/{table}/{table}"; + + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = prefix + }; + + var r = client.ListObjectsV2Async(request); + r.Wait(); + var response = r.Result; + var rows = new List(); + foreach (var o in response.S3Objects) + { + using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(_bucket, o.Key); + { + using var bufferedStream = new BufferedStream(responseStream); + using Stream compressedStream = o.Key.EndsWith(".gz") + ? new GZipStream(bufferedStream, CompressionMode.Decompress) + : new DecompressionStream(bufferedStream) //.zst + ; + using var reader = new StreamReader(compressedStream, Encoding.Default); + string? 
line = reader.ReadLine(); + while (!string.IsNullOrEmpty(line)) + { + var personId = long.Parse(line.Split('\t')[personIndex]); + if (person.PersonId == personId) + { + var chars = o.Key + .Split('/') + .Last() + .SkipWhile(s => !char.IsDigit(s)) + .TakeWhile(s => char.IsDigit(s)) + .ToArray(); + var sliceId = int.Parse(new string(chars)); + return sliceId; + } + line = reader.ReadLine(); + } + } + } + } + + return null; + } + + private void Clean(Vendor vendor, int buildingId, int chunkId, string table, int slice) + { + var attempt = 0; + var complete = false; + + while (!complete) + { + try + { + attempt++; + + var perfix = $"{vendor}/{buildingId}/{_cdmFolder}/{table}/{table}.{slice}.{chunkId}."; + + using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) + { + var request = new ListObjectsV2Request + { + BucketName = _bucket, + Prefix = perfix + }; + ListObjectsV2Response response; + do + { + using var getListObjects = client.ListObjectsV2Async(request); + getListObjects.Wait(); + response = getListObjects.Result; + + var multiObjectDeleteRequest = new DeleteObjectsRequest + { + BucketName = _bucket + }; + + foreach (var o in response.S3Objects) + { + multiObjectDeleteRequest.AddKey(o.Key, null); + } + + if (response.S3Objects.Count > 0) + { + using var deleteObjects = client.DeleteObjectsAsync(multiObjectDeleteRequest); + deleteObjects.Wait(); + + //Console.WriteLine(response.S3Objects.Count + " files deleted"); + } + + request.ContinuationToken = response.NextContinuationToken; + } while (response.IsTruncated == true); + } + + complete = true; + } + catch (Exception ex) + { + Console.Write(" | [Clean] Exception | new attempt | " + attempt); + Console.WriteLine(ex.Message); + if (attempt > 3) + { + throw; + } + } + } + } + #endregion } From d7b63b12f9fc2a815f825f32f65fba762552d41a Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 30 Oct 2024 20:05:01 +0100 Subject: [PATCH 26/37] Implement 
ValidatePersonIdInSlice --- sources/RunValidation/Program.cs | 18 ++++++++--- sources/RunValidation/Validation.cs | 48 ++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 220682ef..42c7ca0c 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -3,6 +3,7 @@ using org.ohdsi.cdm.framework.common.Enums; using org.ohdsi.cdm.framework.common.Utility; using System.Configuration; +using org.ohdsi.cdm.framework.common.Omop; namespace RunValidation { @@ -25,17 +26,22 @@ internal class Options [Option('c', "chunks", Separator = ',', HelpText = "(Optional) Comma-separated list of chunk IDs to process. All of them, if omitted.")] public IEnumerable Chunks { get; set; } = new List(); + [Option('p', "personId", Default = null, HelpText = "(Optional) If specified, the usual check changes to finding SliceId for the given PersonId within the first specified ChunkId.")] + public long? 
PersonId { get; set; } = null; + [Usage(ApplicationAlias = "RunValidation")] public static IEnumerable Examples { get { - yield return new Example("Process all chunks", new Options + yield return new Example("Process all chunks of a vendor", new Options { Vendor = "VendorName", BuildingId = 123}); - yield return new Example("Process all chunks for an external .dll", new Options + yield return new Example("Process all vendor's chunks from an external .dll", new Options { Vendor = "ExternalVendorName", BuildingId = 123, EtlLibraryPath = "C:\\PathToExternalDllFolder"}); - yield return new Example("Process specified chunks", new Options + yield return new Example("Process specified chunks of a vendor", new Options { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1, 2, 3 } }); + yield return new Example("Get SliceId within the given vendor's chunk containing the given PersonId", new Options + { Vendor = "VendorName", BuildingId = 123, Chunks = new List { 1 }, PersonId = 123 }); } } } @@ -69,11 +75,15 @@ static void RunWithOptions(Options opts) Console.WriteLine($"EtlLibraryPath: {opts.EtlLibraryPath}"); Console.WriteLine($"LocalTmpPath: {opts.LocalTmpPath}"); Console.WriteLine($"Current directory: {Directory.GetCurrentDirectory()}"); + Console.WriteLine($"PersonId: {opts.PersonId.ToString() ?? 
""}"); Console.WriteLine(); Vendor vendor = EtlLibrary.CreateVendorInstance(opts.Vendor, opts.EtlLibraryPath); var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, opts.LocalTmpPath, _cdmFolder); - validation.ValidateBuildingId(vendor, opts.BuildingId, chunks); + if (opts.PersonId.HasValue) + validation.ValidatePersonIdInSlice(vendor, opts.BuildingId, opts.Chunks.First(), opts.PersonId.Value); + else + validation.ValidateBuildingId(vendor, opts.BuildingId, chunks); } static void HandleParseError(IEnumerable errs) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index a74df2ca..3eb956fa 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -13,6 +13,7 @@ using Spectre.Console; using System.Collections.Concurrent; using System.IO; +using System.Diagnostics.Eventing.Reader; namespace RunValidation { @@ -21,7 +22,7 @@ public class Validation(string awsAccessKeyId, string awsSecretAccessKey, string { #region classes - public class PersonInS3Chunk(long PersonId, Vendor Vendor, int BuildingId, int ChunkId, bool IsFromBatch = true) + class PersonInS3Chunk(long PersonId, Vendor Vendor, int BuildingId, int ChunkId, bool IsFromBatch = true) { public long PersonId { get; set; } = PersonId; public Vendor Vendor { get; set; } = Vendor; @@ -102,7 +103,26 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) timer.Restart(); } - + /// + /// Short version to quickly get sliceId for a given personId + /// + /// + /// + /// + /// + public void ValidatePersonIdInSlice(Vendor vendor, int buildingId, int chunkId, long personId) + { + var person = new PersonInS3Chunk(personId, vendor, buildingId, chunkId) { IsFromBatch = false }; + person.SliceId = FindSlice(person, vendor.PersonTableName, vendor.PersonIdIndex); + if (person.SliceId.HasValue) + { + AnsiConsole.MarkupLine($"[green]PersonId {person.PersonId} was found in raw SliceId {person.SliceId}![/]"); + } + 
else + { + AnsiConsole.MarkupLine($"[red]PersonId {person.PersonId} was not found in raw Vendor {vendor.Name} - BuildingId {buildingId} - ChunkId {chunkId}![/]"); + } + } private Dictionary> GetPersonsByChunkId(Vendor vendor, int buildingId, List chunks, IProgress progress) @@ -221,7 +241,7 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, HashSet foreach (var slice in s3ObjectsBySlice) { - var slicePersonIds = ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.personObjects, slice.Value.metadataObjects); + var slicePersonIds = ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.PersonObjects, slice.Value.MetadataObjects); task.Increment(1); } @@ -236,7 +256,7 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, HashSet inBatchOnlyExample.SliceId = FindSlice(inBatchOnlyExample, vendor.PersonTableName, vendor.PersonIdIndex); var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersonIds.Count} " + - $"| Id Example={inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? "???"}[/]"; + $"| PersonId Example={inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? 
"???"}[/]"; AnsiConsole.MarkupLine(msg); } @@ -278,7 +298,7 @@ private HashSet GetActualSlices(string vendorName, int buildingId) return slices; } - private Dictionary personObjects, List metadataObjects)> GetS3ObjectsBySlice(Vendor vendor, + private Dictionary PersonObjects, List MetadataObjects)> GetS3ObjectsBySlice(Vendor vendor, int buildingId, int chunkId, List slices2process) { var s3ObjectsBySlice = new Dictionary PersonObjects, List MetadataObjects)>(); @@ -286,23 +306,23 @@ private HashSet GetActualSlices(string vendorName, int buildingId) foreach (var tuple in GetObjects(vendor, buildingId, "PERSON", chunkId, slices2process)) { int sliceId = tuple.Item1; - List personObjects = tuple.Item2; + List PersonObjects = tuple.Item2; if (!s3ObjectsBySlice.ContainsKey(sliceId)) s3ObjectsBySlice[sliceId] = (new List(), new List()); - s3ObjectsBySlice[sliceId].PersonObjects.AddRange(personObjects); + s3ObjectsBySlice[sliceId].PersonObjects.AddRange(PersonObjects); } foreach (var tuple in GetObjects(vendor, buildingId, "METADATA_TMP", chunkId, slices2process)) { int sliceId = tuple.Item1; - List metadataObjects = tuple.Item2; + List MetadataObjects = tuple.Item2; if (!s3ObjectsBySlice.ContainsKey(sliceId)) s3ObjectsBySlice[sliceId] = (new List(), new List()); - s3ObjectsBySlice[sliceId].MetadataObjects.AddRange(metadataObjects); + s3ObjectsBySlice[sliceId].MetadataObjects.AddRange(MetadataObjects); } if (s3ObjectsBySlice.Count == 0) @@ -345,11 +365,11 @@ private IEnumerable>> GetObjects(Vendor vendor, int bu /// /// /// - /// - /// + /// + /// /// Subset of chunkPersonIds for the specified sliceId private HashSet ValidateSliceId(HashSet chunkPersonIds, Vendor vendor, int buildingId, int chunkId, int sliceId, - List personObjects, List metadataObjects) + List PersonObjects, List MetadataObjects) { var attempt = 0; var complete = false; @@ -367,7 +387,7 @@ private HashSet ValidateSliceId(HashSet chunkP var cnt = 0; var attempt1 = attempt; - var allObjects = 
personObjects.Union(metadataObjects).ToList(); + var allObjects = PersonObjects.Union(MetadataObjects).ToList(); Parallel.ForEach(allObjects, o => { @@ -448,7 +468,7 @@ private HashSet ValidateSliceId(HashSet chunkP } /// - /// + /// Try to get sliceId which contains given PersonId and other parameters /// /// This should have Vendor, BuildingId, ChunkId, and PersonId information /// From 0deeff2c406020a5837a6fd05f626f4ebc0a4aea Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 30 Oct 2024 20:20:01 +0100 Subject: [PATCH 27/37] Output changes --- sources/RunValidation/Validation.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 3eb956fa..bb795ebc 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -256,7 +256,7 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, HashSet inBatchOnlyExample.SliceId = FindSlice(inBatchOnlyExample, vendor.PersonTableName, vendor.PersonIdIndex); var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersonIds.Count} " + - $"| PersonId Example={inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? "???"}[/]"; + $"| Example PersonId = {inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? 
"???"}[/]"; AnsiConsole.MarkupLine(msg); } @@ -447,7 +447,7 @@ private HashSet ValidateSliceId(HashSet chunkP { var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId={sliceId} " + $"| WrongCount={slicePersonIdsWrongCount.Count}; Duplicates={slicePersonIdsDuplicated.Count} " + - $"| Wrong Person Id Example={slicePersonIdsWrongCount.First().PersonId}[/]"; + $"| Example Wrong Person Id = {slicePersonIdsWrongCount.First().PersonId}[/]"; AnsiConsole.MarkupLine(msg); } From c229bee5991d713cd9cefc3d59fb5b60ca068456 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 31 Oct 2024 00:08:42 +0100 Subject: [PATCH 28/37] Fixes --- sources/RunValidation/Validation.cs | 107 ++++++++++++++-------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index bb795ebc..f44c1797 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -63,6 +63,11 @@ public override string ToString() private readonly LambdaUtility _lambdaUtility = new LambdaUtility(awsAccessKeyId, awsSecretAccessKey, awsAccessKeyId, awsSecretAccessKey, bucket, bucket, bucket, cdmFolder); + /// + /// >> + /// + private ConcurrentDictionary>> _personsBySliceIdByChunkId; + #endregion @@ -70,6 +75,8 @@ public override string ToString() public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) { + _personsBySliceIdByChunkId = new ConcurrentDictionary>>(); + var _wrong = new List(); var timer = new Stopwatch(); timer.Start(); @@ -77,7 +84,6 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) var actualSlices = GetActualSlices(vendor.Name, buildingId).OrderBy(s => s).ToList(); int chunkCount = 0; - var personsByChunkId = new Dictionary>(); AnsiConsole.Status() .Spinner(Spinner.Known.Dots) @@ -89,14 +95,14 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) ctx.Status($"Getting all chunks... 
(Chunks obtained: {chunkCount})"); }); - personsByChunkId = GetPersonsByChunkId(vendor, buildingId, chunks, progress); + GetPersonsByChunkId(vendor, buildingId, chunks, progress); }); timer.Stop(); - AnsiConsole.MarkupLine($"[green]Getting all {personsByChunkId.Keys.Count} chunks done. It took {timer.ElapsedMilliseconds / 1000}s[/]"); + AnsiConsole.MarkupLine($"[green]Getting all {_personsBySliceIdByChunkId.Keys.Count} chunks done. It took {timer.ElapsedMilliseconds / 1000}s[/]"); timer.Restart(); - ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices, personsByChunkId); + ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices); timer.Stop(); AnsiConsole.MarkupLine($"[green]Done. Problematic chunks, if any, are described above the chunk progress. Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); @@ -112,6 +118,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) /// public void ValidatePersonIdInSlice(Vendor vendor, int buildingId, int chunkId, long personId) { + _personsBySliceIdByChunkId = new ConcurrentDictionary>>(); var person = new PersonInS3Chunk(personId, vendor, buildingId, chunkId) { IsFromBatch = false }; person.SliceId = FindSlice(person, vendor.PersonTableName, vendor.PersonIdIndex); if (person.SliceId.HasValue) @@ -125,9 +132,11 @@ public void ValidatePersonIdInSlice(Vendor vendor, int buildingId, int chunkId, } - private Dictionary> GetPersonsByChunkId(Vendor vendor, int buildingId, List chunks, IProgress progress) - { - var persons = new Dictionary>(); + private void GetPersonsByChunkId(Vendor vendor, int buildingId, List chunks, IProgress progress) + { + _personsBySliceIdByChunkId = new ConcurrentDictionary>>(); + + var prefix = $"{vendor}/{buildingId}/_chunks"; int chunkCount = 0; @@ -170,21 +179,19 @@ private Dictionary> GetPersonsByChunkId(Vendor ven if (chunks.Any() && !chunks.Any(s => s == chunkId)) break; - if (!persons.ContainsKey(chunkId)) - persons.Add(chunkId, new HashSet()); - 
persons[chunkId].Add(new PersonInS3Chunk(personId, vendor, buildingId, chunkId)); + var chunkIdDictionary = _personsBySliceIdByChunkId.GetOrAdd(chunkId, new ConcurrentDictionary>()); + var sliceIdDictionary = chunkIdDictionary.GetOrAdd(-1, new ConcurrentDictionary()); + var person = sliceIdDictionary.GetOrAdd(personId, new PersonInS3Chunk(personId, vendor, buildingId, chunkId)); line = reader.ReadLine(); } } } - - return persons; } - private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices, Dictionary> personsByChunkId) + private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices) { AnsiConsole.MarkupLine("\n"); @@ -201,7 +208,7 @@ private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List { var tasks = new ConcurrentDictionary(); - foreach (var awsChunkId in personsByChunkId.Keys) + foreach (var awsChunkId in _personsBySliceIdByChunkId.Keys) { if (chunks.Any() && !chunks.Contains(awsChunkId)) { @@ -212,50 +219,49 @@ private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List tasks[awsChunkId] = task; } - Parallel.ForEach(personsByChunkId.Keys.OrderBy(s => s) + Parallel.ForEach(_personsBySliceIdByChunkId.Keys.OrderBy(s => s) , new ParallelOptions {MaxDegreeOfParallelism = Environment.ProcessorCount == 1 ? 
1 : Environment.ProcessorCount - 1 } // leave 1 core for UI and OS , awsChunkId => { - var awsChunkPersonIds = personsByChunkId[awsChunkId]; + var awsChunkPersonIds = _personsBySliceIdByChunkId[awsChunkId]; if (tasks.TryGetValue(awsChunkId, out var task)) { - ValidateChunkIdWithProgress(vendor, buildingId, awsChunkPersonIds, actualSlices, task); + ValidateChunkIdWithProgress(vendor, buildingId, awsChunkId, awsChunkPersonIds, actualSlices, task); } - personsByChunkId[awsChunkId].Clear(); + _personsBySliceIdByChunkId[awsChunkId].Clear(); awsChunkPersonIds = null; GC.Collect(); }); - GC.KeepAlive(personsByChunkId); + GC.KeepAlive(_personsBySliceIdByChunkId); }); } - private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, HashSet chunkPersonIds, List slices, ProgressTask task) + private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, ConcurrentDictionary> chunkPersonIds, List slices, ProgressTask task) { - var chunkId = chunkPersonIds.First().ChunkId; - var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); foreach (var slice in s3ObjectsBySlice) { - var slicePersonIds = ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.PersonObjects, slice.Value.MetadataObjects); + ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.PersonObjects, slice.Value.MetadataObjects); task.Increment(1); } - var inBatchOnlyPersonIds = chunkPersonIds + var inBatchOnlyPersons = chunkPersonIds + .Where(s => s.Key != -1) // this SliceId contains copies from all other SliceId HashSets + .SelectMany(s => s.Value.Values) .Where(s => s.InMetadataFilesCount + s.InPersonFilesCount == 0) .ToHashSet(); - - if (inBatchOnlyPersonIds.Count > 0) + if (inBatchOnlyPersons.Count > 0) { - var inBatchOnlyExample = inBatchOnlyPersonIds.First(); + var inBatchOnlyExample = inBatchOnlyPersons.First(); inBatchOnlyExample.SliceId = FindSlice(inBatchOnlyExample, vendor.PersonTableName, 
vendor.PersonIdIndex); - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersonIds.Count} " + + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersons.Count} " + $"| Example PersonId = {inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? "???"}[/]"; AnsiConsole.MarkupLine(msg); } @@ -368,7 +374,7 @@ private IEnumerable>> GetObjects(Vendor vendor, int bu /// /// /// Subset of chunkPersonIds for the specified sliceId - private HashSet ValidateSliceId(HashSet chunkPersonIds, Vendor vendor, int buildingId, int chunkId, int sliceId, + private void ValidateSliceId(ConcurrentDictionary> chunkPersonIds, Vendor vendor, int buildingId, int chunkId, int sliceId, List PersonObjects, List MetadataObjects) { var attempt = 0; @@ -382,12 +388,12 @@ private HashSet ValidateSliceId(HashSet chunkP var timer = new Stopwatch(); timer.Start(); - #region chunkPersonIds -> set counts + #region chunkPersonIds -> set counts. After this chunkPersonIds will have a HashSet for each SliceId and all of them are also in Sliceid -1. 
- var cnt = 0; var attempt1 = attempt; var allObjects = PersonObjects.Union(MetadataObjects).ToList(); + Parallel.ForEach(allObjects, o => { @@ -408,36 +414,35 @@ private HashSet ValidateSliceId(HashSet chunkP while (csv.Read()) { var personId = (long)csv.GetField(typeof(long), 0); - lock (chunkPersonIds) + + var sliceIdDictionary = chunkPersonIds.GetOrAdd(sliceId, new ConcurrentDictionary()); + var personToProcess = sliceIdDictionary.GetOrAdd(personId, chunkPersonIds[-1][personId]); + personToProcess.SliceId ??= sliceId; + + if (o.Key.Contains("PERSON")) { - var localPersonInS3 = new PersonInS3Chunk(personId, vendor, buildingId, chunkId, false); - if (chunkPersonIds.TryGetValue(localPersonInS3, out var actual)) - localPersonInS3 = actual; - else - chunkPersonIds.Add(localPersonInS3); - - localPersonInS3.SliceId ??= sliceId; - - if (o.Key.Contains("PERSON")) - localPersonInS3.InPersonFilesCount++; - else if (o.Key.Contains("METADATA_TMP")) - localPersonInS3.InMetadataFilesCount++; + personToProcess.InPersonFilesCount++; } + else if (o.Key.Contains("METADATA_TMP")) + { + personToProcess.InMetadataFilesCount++; + } + else + throw new NotImplementedException("o.Key=" + o.Key); } - Interlocked.Increment(ref cnt); + }); #endregion - var slicePersonIds = chunkPersonIds - .Where(s => s.SliceId == sliceId) - .ToHashSet(); + if (!chunkPersonIds.TryGetValue(sliceId, out var slicePersonIds)) + return; - var slicePersonIdsDuplicated = slicePersonIds + var slicePersonIdsDuplicated = slicePersonIds.Values .Where(s => s.InPersonFilesCount + s.InMetadataFilesCount > 1) .ToHashSet(); - var slicePersonIdsWrongCount = slicePersonIds + var slicePersonIdsWrongCount = slicePersonIds.Values .Where(s => s.InPersonFilesCount != 1 || s.InMetadataFilesCount != 0 || !s.IsFromBatch) .ToHashSet(); @@ -452,7 +457,6 @@ private HashSet ValidateSliceId(HashSet chunkP } complete = true; - return slicePersonIds; } catch (Exception ex) { @@ -464,7 +468,6 @@ private HashSet ValidateSliceId(HashSet 
chunkP } } } - return new HashSet(); } /// From a5a997305052ac586d8e6f173839db427a2404b2 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 31 Oct 2024 00:47:06 +0100 Subject: [PATCH 29/37] Fix InBatchOnly calc --- sources/RunValidation/Validation.cs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index f44c1797..c7384864 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -250,18 +250,22 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chu task.Increment(1); } - var inBatchOnlyPersons = chunkPersonIds - .Where(s => s.Key != -1) // this SliceId contains copies from all other SliceId HashSets - .SelectMany(s => s.Value.Values) - .Where(s => s.InMetadataFilesCount + s.InPersonFilesCount == 0) + var personIdsInPersonOrMetadata = chunkPersonIds + .Where(s => s.Key != -1) // Sliceid -1 contains copies from all other SliceId HashSets + .SelectMany(s => s.Value.Keys) + .ToHashSet(); + + var personsInBatchOnly = chunkPersonIds + .First(s => s.Key == -1).Value.Values // Sliceid -1 contains copies from all other SliceId HashSets + .Where(s => !personIdsInPersonOrMetadata.Any(a => a == s.PersonId)) .ToHashSet(); - if (inBatchOnlyPersons.Count > 0) + if (personsInBatchOnly.Count > 0) { - var inBatchOnlyExample = inBatchOnlyPersons.First(); + var inBatchOnlyExample = personsInBatchOnly.First(); inBatchOnlyExample.SliceId = FindSlice(inBatchOnlyExample, vendor.PersonTableName, vendor.PersonIdIndex); - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={inBatchOnlyPersons.Count} " + + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={personsInBatchOnly.Count} " + $"| Example PersonId = {inBatchOnlyExample.PersonId}, SliceId = {inBatchOnlyExample.SliceId.ToString() ?? 
"???"}[/]"; AnsiConsole.MarkupLine(msg); } From 682d4ba55da721def8a556b433b69a52354a73ff Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Thu, 31 Oct 2024 00:59:11 +0100 Subject: [PATCH 30/37] Fix --- sources/RunValidation/Validation.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index c7384864..c307ea5b 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -257,7 +257,7 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chu var personsInBatchOnly = chunkPersonIds .First(s => s.Key == -1).Value.Values // Sliceid -1 contains copies from all other SliceId HashSets - .Where(s => !personIdsInPersonOrMetadata.Any(a => a == s.PersonId)) + .Where(s => !personIdsInPersonOrMetadata.TryGetValue(s.PersonId, out long matchedPersonId)) .ToHashSet(); if (personsInBatchOnly.Count > 0) From 5f28d46ea43b1f5399a12ba7302f3606a97fdb2c Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Tue, 5 Nov 2024 19:04:54 +0100 Subject: [PATCH 31/37] Process chunk right after getting it to avoid excessive memory usage. Hide progress bars after calculation. 
Optimize getting slices --- sources/RunValidation/Validation.cs | 261 ++++++++++++++-------------- 1 file changed, 128 insertions(+), 133 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index c307ea5b..33285e56 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -63,51 +63,29 @@ public override string ToString() private readonly LambdaUtility _lambdaUtility = new LambdaUtility(awsAccessKeyId, awsSecretAccessKey, awsAccessKeyId, awsSecretAccessKey, bucket, bucket, bucket, cdmFolder); - /// - /// >> - /// - private ConcurrentDictionary>> _personsBySliceIdByChunkId; - + List _dataErrorMessages = new List(); #endregion #region Methods - public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) { - _personsBySliceIdByChunkId = new ConcurrentDictionary>>(); + _dataErrorMessages = new List(); - var _wrong = new List(); - var timer = new Stopwatch(); - timer.Start(); + var timer = Stopwatch.StartNew(); var actualSlices = GetActualSlices(vendor.Name, buildingId).OrderBy(s => s).ToList(); - int chunkCount = 0; + ProcessChunks(vendor, buildingId, chunks, actualSlices); - AnsiConsole.Status() - .Spinner(Spinner.Known.Dots) - .Start("Getting all chunks...", ctx => - { - var progress = new Progress(count => - { - chunkCount = count; - ctx.Status($"Getting all chunks... (Chunks obtained: {chunkCount})"); - }); - - GetPersonsByChunkId(vendor, buildingId, chunks, progress); - }); + foreach (var msg in _dataErrorMessages) + AnsiConsole.MarkupLine(msg); timer.Stop(); - AnsiConsole.MarkupLine($"[green]Getting all {_personsBySliceIdByChunkId.Keys.Count} chunks done. It took {timer.ElapsedMilliseconds / 1000}s[/]"); - timer.Restart(); + AnsiConsole.MarkupLine($"[green]Done. Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); + } - ProcessChunksWithProgress(vendor, buildingId, chunks, actualSlices); - timer.Stop(); - AnsiConsole.MarkupLine($"[green]Done. 
Problematic chunks, if any, are described above the chunk progress. Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); - timer.Restart(); - } /// /// Short version to quickly get sliceId for a given personId @@ -118,9 +96,8 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) /// public void ValidatePersonIdInSlice(Vendor vendor, int buildingId, int chunkId, long personId) { - _personsBySliceIdByChunkId = new ConcurrentDictionary>>(); var person = new PersonInS3Chunk(personId, vendor, buildingId, chunkId) { IsFromBatch = false }; - person.SliceId = FindSlice(person, vendor.PersonTableName, vendor.PersonIdIndex); + GetSlicesFromS3(new HashSet() { person }, vendor.PersonTableName, vendor.PersonIdIndex); if (person.SliceId.HasValue) { AnsiConsole.MarkupLine($"[green]PersonId {person.PersonId} was found in raw SliceId {person.SliceId}![/]"); @@ -132,15 +109,9 @@ public void ValidatePersonIdInSlice(Vendor vendor, int buildingId, int chunkId, } - private void GetPersonsByChunkId(Vendor vendor, int buildingId, List chunks, IProgress progress) + private void ProcessChunks(Vendor vendor, int buildingId, List chunks, List slices) { - _personsBySliceIdByChunkId = new ConcurrentDictionary>>(); - - - var prefix = $"{vendor}/{buildingId}/_chunks"; - int chunkCount = 0; - int previousChunk = -1; using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) { @@ -150,96 +121,105 @@ private void GetPersonsByChunkId(Vendor vendor, int buildingId, List chunks Prefix = prefix }; - var response = client.ListObjectsV2Async(request); - response.Wait(); + var s3Objects = client + .ListObjectsV2Async(request) + .GetAwaiter() + .GetResult() + .S3Objects + .OrderBy(s => s.LastModified) + .ToList(); - foreach (var o in response.Result.S3Objects.OrderBy(o => o.LastModified)) - { - using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); - using var 
responseStream = transferUtility.OpenStream(_bucket, o.Key); - using var bufferedStream = new BufferedStream(responseStream); - using Stream compressedStream = o.Key.EndsWith(".gz") - ? new GZipStream(bufferedStream, CompressionMode.Decompress) - : new DecompressionStream(bufferedStream) //.zst - ; - using var reader = new StreamReader(compressedStream, Encoding.Default); - string? line = reader.ReadLine(); - while (!string.IsNullOrEmpty(line)) - { - var splits = line.Split('\t'); - var chunkId = int.Parse(splits[0]); - var personId = long.Parse(splits[1]); + var chunkIds = s3Objects + .Select(o => int.Parse(o.Key.Split(new[] { "_chunks", ".txt", ".gz", ".zst" }, StringSplitOptions.RemoveEmptyEntries).Last())) + .Distinct() + .ToList(); - if (chunkId != previousChunk) - { - previousChunk = chunkId; - chunkCount++; - progress?.Report(chunkCount); - } - if (chunks.Any() && !chunks.Any(s => s == chunkId)) - break; - - var chunkIdDictionary = _personsBySliceIdByChunkId.GetOrAdd(chunkId, new ConcurrentDictionary>()); - var sliceIdDictionary = chunkIdDictionary.GetOrAdd(-1, new ConcurrentDictionary()); - var person = sliceIdDictionary.GetOrAdd(personId, new PersonInS3Chunk(personId, vendor, buildingId, chunkId)); - - line = reader.ReadLine(); - } + if (chunks.Any()) + { + chunkIds = chunkIds.Where(chunkId => chunks.Contains(chunkId)).ToList(); } - } - } + var totalChunks = chunkIds.Count; + + AnsiConsole.Progress() + .AutoClear(false) + .HideCompleted(true) + .Columns( + new TaskDescriptionColumn(), + new ProgressBarColumn(), + new PercentageColumn(), + new RemainingTimeColumn(), + new SpinnerColumn()) + .Start(ctx => + { + var overallTask = ctx.AddTask("Processing chunks...", maxValue: totalChunks); + var processingTasks = new List(); + int maxDegreeOfParallelism = Environment.ProcessorCount == 1 ? 
1 : Environment.ProcessorCount - 1; + var semaphore = new SemaphoreSlim(maxDegreeOfParallelism); - private void ProcessChunksWithProgress(Vendor vendor, int buildingId, List chunks, List actualSlices) - { - AnsiConsole.MarkupLine("\n"); - - AnsiConsole.Progress() - .AutoClear(false) - .HideCompleted(false) - .Columns( - new TaskDescriptionColumn(), - new ProgressBarColumn(), - new PercentageColumn(), - new RemainingTimeColumn(), - new SpinnerColumn()) - .Start(ctx => - { - var tasks = new ConcurrentDictionary(); - - foreach (var awsChunkId in _personsBySliceIdByChunkId.Keys) - { - if (chunks.Any() && !chunks.Contains(awsChunkId)) + foreach (var s3obj in s3Objects) { - continue; - } + var chunkId = int.Parse(s3obj.Key.Split(new[] { "_chunks", ".txt", ".gz", ".zst" }, StringSplitOptions.RemoveEmptyEntries).Last()); - var task = ctx.AddTask($"Chunk {awsChunkId}", maxValue: actualSlices.Count); - tasks[awsChunkId] = task; - } + if (chunks.Any() && !chunks.Contains(chunkId)) + { + overallTask.Increment(1); + continue; + } - Parallel.ForEach(_personsBySliceIdByChunkId.Keys.OrderBy(s => s) - , new ParallelOptions {MaxDegreeOfParallelism = Environment.ProcessorCount == 1 ? 1 : Environment.ProcessorCount - 1 } // leave 1 core for UI and OS - , awsChunkId => - { - var awsChunkPersonIds = _personsBySliceIdByChunkId[awsChunkId]; + semaphore.Wait(); + var localChunkId = chunkId; - if (tasks.TryGetValue(awsChunkId, out var task)) - { - ValidateChunkIdWithProgress(vendor, buildingId, awsChunkId, awsChunkPersonIds, actualSlices, task); + var task = Task.Run(() => + { + try + { + var chunkPersonIds = new ConcurrentDictionary>(); + + using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); + using var responseStream = transferUtility.OpenStream(_bucket, s3obj.Key); + using var bufferedStream = new BufferedStream(responseStream); + using Stream compressedStream = s3obj.Key.EndsWith(".gz") + ? 
new GZipStream(bufferedStream, CompressionMode.Decompress) + : new DecompressionStream(bufferedStream); // .zst + using var reader = new StreamReader(compressedStream, Encoding.Default); + string? line = reader.ReadLine(); + while (!string.IsNullOrEmpty(line)) + { + var splits = line.Split('\t'); + var personId = long.Parse(splits[1]); + + var sliceIdDictionary = chunkPersonIds.GetOrAdd(-1, _ => new ConcurrentDictionary()); + sliceIdDictionary.GetOrAdd(personId, _ => new PersonInS3Chunk(personId, vendor, buildingId, localChunkId)); + + line = reader.ReadLine(); + } + + ValidateChunkIdWithProgress( + vendor, + buildingId, + localChunkId, + chunkPersonIds, + slices, + ctx.AddTask($"Chunk {localChunkId}", maxValue: slices.Count)); + + overallTask.Increment(1); + } + finally + { + semaphore.Release(); + } + }); + + processingTasks.Add(task); } - _personsBySliceIdByChunkId[awsChunkId].Clear(); - awsChunkPersonIds = null; - GC.Collect(); + Task.WaitAll(processingTasks.ToArray()); }); - GC.KeepAlive(_personsBySliceIdByChunkId); - }); + } } - - private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, ConcurrentDictionary> chunkPersonIds, List slices, ProgressTask task) { var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); @@ -259,15 +239,25 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chu .First(s => s.Key == -1).Value.Values // Sliceid -1 contains copies from all other SliceId HashSets .Where(s => !personIdsInPersonOrMetadata.TryGetValue(s.PersonId, out long matchedPersonId)) .ToHashSet(); - - if (personsInBatchOnly.Count > 0) - { - var inBatchOnlyExample = personsInBatchOnly.First(); - inBatchOnlyExample.SliceId = FindSlice(inBatchOnlyExample, vendor.PersonTableName, vendor.PersonIdIndex); - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} | InBatchOnlyPersonIdsCount={personsInBatchOnly.Count} " + - $"| Example PersonId = {inBatchOnlyExample.PersonId}, SliceId = 
{inBatchOnlyExample.SliceId.ToString() ?? "???"}[/]"; - AnsiConsole.MarkupLine(msg); + if (personsInBatchOnly.Count > 0) + { + //var inBatchOnlyExamples = personsInBatchOnly.Where((s, i) => i == 0 || i % 500 == 0).ToList(); + //GetSlicesFromS3(inBatchOnlyExamples, vendor.PersonTableName, vendor.PersonIdIndex); + GetSlicesFromS3(personsInBatchOnly, vendor.PersonTableName, vendor.PersonIdIndex); + + var slicesToCheck = personsInBatchOnly + .Select(s => s.SliceId) + .Distinct() + .OrderBy(s => s) + .ToList(); + + var example1 = personsInBatchOnly.First(); + + var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId=??? | InBatchOnlyPersonIdsCount={personsInBatchOnly.Count} " + + $"| Example PersonId={example1.PersonId}, Calculalted SliceId={example1.SliceId.ToString() ?? "???"}" + + $"| All slices with missing PersonIds={string.Join(",", slicesToCheck)}[/]"; + _dataErrorMessages.Add(msg); } } @@ -399,7 +389,7 @@ private void ValidateSliceId(ConcurrentDictionary + allObjects.ForEach(o => { using var transferUtility = new TransferUtility(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1); using var responseStream = transferUtility.OpenStream(_bucket, o.Key); @@ -457,7 +447,7 @@ private void ValidateSliceId(ConcurrentDictionaryThis should have Vendor, BuildingId, ChunkId, and PersonId information /// /// - private int? 
FindSlice(PersonInS3Chunk person, string table, int personIndex) - { - var prefix = $"{person.Vendor.Name}/{person.BuildingId}/raw/{person.ChunkId}/{table}/{table}"; + private void GetSlicesFromS3(HashSet personsOfSingleChunkId, string table, int personIndex) + { + var vendor = personsOfSingleChunkId.First().Vendor; + var buildingId = personsOfSingleChunkId.First().BuildingId; + var chunkId = personsOfSingleChunkId.First().ChunkId; + + var prefix = $"{vendor.Name}/{buildingId}/raw/{chunkId}/{table}/{table}"; using (var client = new AmazonS3Client(_awsAccessKeyId, _awsSecretAccessKey, Amazon.RegionEndpoint.USEast1)) { @@ -510,8 +504,8 @@ private void ValidateSliceId(ConcurrentDictionary !char.IsDigit(s)) .TakeWhile(s => char.IsDigit(s)) .ToArray(); - var sliceId = int.Parse(new string(chars)); - return sliceId; + personProvided.SliceId = int.Parse(new string(chars)); + + if (personsOfSingleChunkId.All(s => s.SliceId.HasValue)) + return; } line = reader.ReadLine(); } } + } } - - return null; } private void Clean(Vendor vendor, int buildingId, int chunkId, string table, int slice) From b8110b57363a94b749cd3230f7dea33baad90d42 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Tue, 12 Nov 2024 10:10:30 +0100 Subject: [PATCH 32/37] Output format --- sources/RunValidation/Validation.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 33285e56..05880776 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -255,7 +255,7 @@ private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chu var example1 = personsInBatchOnly.First(); var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId=??? | InBatchOnlyPersonIdsCount={personsInBatchOnly.Count} " + - $"| Example PersonId={example1.PersonId}, Calculalted SliceId={example1.SliceId.ToString() ?? 
"???"}" + + $"| Example PersonId={example1.PersonId}, Calculalted SliceId={example1.SliceId.ToString() ?? "???"} " + $"| All slices with missing PersonIds={string.Join(",", slicesToCheck)}[/]"; _dataErrorMessages.Add(msg); } From e1b05543e1ee9307c793c8d7c65d4087665ed2be Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Tue, 12 Nov 2024 14:16:36 +0100 Subject: [PATCH 33/37] Improve validation output structure. Fix some GetHashCode+Equals methods --- .../Common/Enums/Vendor.cs | 18 ++ sources/RunValidation/Validation.cs | 157 +++++++++++++----- 2 files changed, 131 insertions(+), 44 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs index 224a04d6..9d1f625b 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs @@ -25,6 +25,24 @@ public override string ToString() return this.Name; } + public override bool Equals(object? obj) + { + if (obj is not Vendor other || other == null) + return false; + + return this.Name == other.Name; + } + + public override int GetHashCode() + { + unchecked // prevents overflow exceptions + { + int hash = 17; + hash = hash * 23 + this.Name.GetHashCode(); + return hash; + } + } + #endregion } } \ No newline at end of file diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 05880776..58cb2277 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -28,28 +28,59 @@ class PersonInS3Chunk(long PersonId, Vendor Vendor, int BuildingId, int ChunkId, public Vendor Vendor { get; set; } = Vendor; public int BuildingId { get; set; } = BuildingId; public int ChunkId { get; set; } = ChunkId; + public bool IsFromBatch { get; set; } = IsFromBatch; public int? SliceId { get; set; } public int? InPersonFilesCount { get; set; } = 0; public int? 
InMetadataFilesCount { get; set; } = 0; - public override int GetHashCode() + public override string ToString() { - return Vendor.GetHashCode() ^ BuildingId.GetHashCode() ^ ChunkId.GetHashCode() ^ PersonId.GetHashCode(); //assumming that a single PersonId is never duplicated in a single ChunkId + return $"{Vendor} - {BuildingId} - {ChunkId} - {SliceId?.ToString() ?? "???"} - {PersonId}"; } - + public override bool Equals(object? obj) { - if (obj is not PersonInS3Chunk) + if (obj is not PersonInS3Chunk other || other == null) return false; - return ((PersonInS3Chunk)obj).GetHashCode() == this.GetHashCode(); + + return EqualityComparer.Default.Equals(this.Vendor, other.Vendor) + && BuildingId == other.BuildingId + && ChunkId == other.ChunkId + && PersonId == other.PersonId; } - public override string ToString() + public override int GetHashCode() { - return $"{Vendor} - {BuildingId} - {ChunkId} - {SliceId?.ToString() ?? "???"} - {PersonId}"; + unchecked // prevents overflow exceptions + { + int hash = 17; + hash = hash * 23 + (Vendor != null ? Vendor.GetHashCode() : 0); + hash = hash * 23 + BuildingId.GetHashCode(); + hash = hash * 23 + ChunkId.GetHashCode(); + hash = hash * 23 + PersonId.GetHashCode(); // assuming each PersonId is unique within a ChunkId + return hash; + } } } + + class ChunkReport + { + public int BuildingId { get; set; } + public int ChunkId { get; set; } + public int OnlyInBatchIdsCount { get; set; } + public List AllSlicesWithOnlyInBatchIds { get; set; } = new List(); + public PersonInS3Chunk? 
ExamplePersonWithCalculatedSlice { get; set; } + public List SliceReports { get; set; } = new List(); + } + + class SliceReport + { + public int SliceId { get; set; } + public int WrongCount { get; set; } + public int Duplicates { get; set; } + public long ExampleWrongPersonId { get; set; } + } #endregion @@ -62,15 +93,13 @@ public override string ToString() private readonly string _cdmFolder = cdmFolder; private readonly LambdaUtility _lambdaUtility = new LambdaUtility(awsAccessKeyId, awsSecretAccessKey, awsAccessKeyId, awsSecretAccessKey, bucket, bucket, bucket, cdmFolder); - - List _dataErrorMessages = new List(); + private readonly List _chunkReports = new List(); #endregion #region Methods public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) { - _dataErrorMessages = new List(); var timer = Stopwatch.StartNew(); @@ -78,8 +107,31 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) ProcessChunks(vendor, buildingId, chunks, actualSlices); - foreach (var msg in _dataErrorMessages) - AnsiConsole.MarkupLine(msg); + foreach (var chunkReport in _chunkReports.OrderBy(c => c.ChunkId)) + { + string msg = $"[red]chunkId - {chunkReport.ChunkId}"; + if (chunkReport.OnlyInBatchIdsCount > 0) + msg += $" | OnlyInBatchIdsCount={chunkReport.OnlyInBatchIdsCount} | " + + $"allSlicesWithOnlyInBatchIds={string.Join(",", chunkReport.AllSlicesWithOnlyInBatchIds)} | " + + $"Example PersonId={chunkReport.ExamplePersonWithCalculatedSlice?.PersonId}, Calculated SliceId={chunkReport.ExamplePersonWithCalculatedSlice?.SliceId}"; + + if (!chunkReport.SliceReports.Any(s => s.WrongCount > 0) && chunkReport.OnlyInBatchIdsCount == 0) + continue; + + AnsiConsole.MarkupLine($"{msg}[/]"); + + foreach (var sliceReport in chunkReport.SliceReports.OrderBy(s => s.SliceId)) + { + if (sliceReport.WrongCount == 0) + continue; + + AnsiConsole.MarkupLine($"[red]\tsliceId - {sliceReport.SliceId} | WrongCount={sliceReport.WrongCount}; 
Duplicates={sliceReport.Duplicates} | " + + $"Example Wrong Person Id = {sliceReport.ExampleWrongPersonId}[/]"); + } + + AnsiConsole.WriteLine(); + } + timer.Stop(); AnsiConsole.MarkupLine($"[green]Done. Total seconds={timer.ElapsedMilliseconds / 1000}s[/]"); @@ -220,49 +272,62 @@ private void ProcessChunks(Vendor vendor, int buildingId, List chunks, List } } - private void ValidateChunkIdWithProgress(Vendor vendor, int buildingId, int chunkId, ConcurrentDictionary> chunkPersonIds, List slices, ProgressTask task) + private void ValidateChunkIdWithProgress( + Vendor vendor, + int buildingId, + int chunkId, + ConcurrentDictionary> chunkPersonIds, + List slices, + ProgressTask task) { + var chunkReport = new ChunkReport + { + BuildingId = buildingId, + ChunkId = chunkId + }; + var s3ObjectsBySlice = GetS3ObjectsBySlice(vendor, buildingId, chunkId, slices); foreach (var slice in s3ObjectsBySlice) { - ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.PersonObjects, slice.Value.MetadataObjects); + ValidateSliceId(chunkPersonIds, vendor, buildingId, chunkId, slice.Key, slice.Value.PersonObjects, slice.Value.MetadataObjects, chunkReport); task.Increment(1); } var personIdsInPersonOrMetadata = chunkPersonIds - .Where(s => s.Key != -1) // Sliceid -1 contains copies from all other SliceId HashSets + .Where(s => s.Key != -1) // -1 has copies of all the personIds, even without assigned SliceId .SelectMany(s => s.Value.Keys) .ToHashSet(); var personsInBatchOnly = chunkPersonIds - .First(s => s.Key == -1).Value.Values // Sliceid -1 contains copies from all other SliceId HashSets - .Where(s => !personIdsInPersonOrMetadata.TryGetValue(s.PersonId, out long matchedPersonId)) + .First(s => s.Key == -1).Value.Values + .Where(s => !personIdsInPersonOrMetadata.Contains(s.PersonId)) .ToHashSet(); if (personsInBatchOnly.Count > 0) - { - //var inBatchOnlyExamples = personsInBatchOnly.Where((s, i) => i == 0 || i % 500 == 0).ToList(); - 
//GetSlicesFromS3(inBatchOnlyExamples, vendor.PersonTableName, vendor.PersonIdIndex); + { GetSlicesFromS3(personsInBatchOnly, vendor.PersonTableName, vendor.PersonIdIndex); var slicesToCheck = personsInBatchOnly - .Select(s => s.SliceId) + .Where(s => s.SliceId.HasValue) + .Select(s => s.SliceId!.Value) .Distinct() .OrderBy(s => s) .ToList(); - var example1 = personsInBatchOnly.First(); - - var msg = $"[red]BuildingId={buildingId} ChunkId={chunkId} SliceId=??? | InBatchOnlyPersonIdsCount={personsInBatchOnly.Count} " + - $"| Example PersonId={example1.PersonId}, Calculalted SliceId={example1.SliceId.ToString() ?? "???"} " + - $"| All slices with missing PersonIds={string.Join(",", slicesToCheck)}[/]"; - _dataErrorMessages.Add(msg); + chunkReport.OnlyInBatchIdsCount = personsInBatchOnly.Count; + chunkReport.AllSlicesWithOnlyInBatchIds = slicesToCheck; + chunkReport.ExamplePersonWithCalculatedSlice = personsInBatchOnly.First(); } + lock (_chunkReports) + { + _chunkReports.Add(chunkReport); + } } + private HashSet GetActualSlices(string vendorName, int buildingId) { var slices = new HashSet(); @@ -368,8 +433,15 @@ private IEnumerable>> GetObjects(Vendor vendor, int bu /// /// /// Subset of chunkPersonIds for the specified sliceId - private void ValidateSliceId(ConcurrentDictionary> chunkPersonIds, Vendor vendor, int buildingId, int chunkId, int sliceId, - List PersonObjects, List MetadataObjects) + private void ValidateSliceId( + ConcurrentDictionary> chunkPersonIds, + Vendor vendor, + int buildingId, + int chunkId, + int sliceId, + List PersonObjects, + List MetadataObjects, + ChunkReport chunkReport) { var attempt = 0; var complete = false; @@ -382,12 +454,7 @@ private void ValidateSliceId(ConcurrentDictionary set counts. After this chunkPersonIds will have a HashSet for each SliceId and all of them are also in Sliceid -1. 
- - var attempt1 = attempt; - var allObjects = PersonObjects.Union(MetadataObjects).ToList(); - allObjects.ForEach(o => { @@ -396,8 +463,7 @@ private void ValidateSliceId(ConcurrentDictionary()); + var sliceIdDictionary = chunkPersonIds.GetOrAdd(sliceId, new ConcurrentDictionary()); var personToProcess = sliceIdDictionary.GetOrAdd(personId, chunkPersonIds[-1][personId]); personToProcess.SliceId ??= sliceId; @@ -424,11 +490,8 @@ private void ValidateSliceId(ConcurrentDictionary /// Try to get sliceId which contains given PersonId and other parameters /// From d38820ecadac9168db5d9a0293ff3c07b172b4c5 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Tue, 12 Nov 2024 14:32:37 +0100 Subject: [PATCH 34/37] Output fixes. Add command for RunLocal --- sources/RunValidation/Validation.cs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 58cb2277..9ab1d5fd 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -109,24 +109,26 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) foreach (var chunkReport in _chunkReports.OrderBy(c => c.ChunkId)) { - string msg = $"[red]chunkId - {chunkReport.ChunkId}"; + string chunkMsg = $"[red]chunkId={chunkReport.ChunkId}"; if (chunkReport.OnlyInBatchIdsCount > 0) - msg += $" | OnlyInBatchIdsCount={chunkReport.OnlyInBatchIdsCount} | " + - $"allSlicesWithOnlyInBatchIds={string.Join(",", chunkReport.AllSlicesWithOnlyInBatchIds)} | " + + chunkMsg += $" | PersonsOnlyInBatch={chunkReport.OnlyInBatchIdsCount} | " + + $"SlicesWithPersonsOnlyInBatch={string.Join(",", chunkReport.AllSlicesWithOnlyInBatchIds)} | " + $"Example PersonId={chunkReport.ExamplePersonWithCalculatedSlice?.PersonId}, Calculated SliceId={chunkReport.ExamplePersonWithCalculatedSlice?.SliceId}"; if (!chunkReport.SliceReports.Any(s => s.WrongCount > 0) && chunkReport.OnlyInBatchIdsCount == 0) continue; - 
AnsiConsole.MarkupLine($"{msg}[/]"); + AnsiConsole.MarkupLine($"{chunkMsg}[/]"); foreach (var sliceReport in chunkReport.SliceReports.OrderBy(s => s.SliceId)) { if (sliceReport.WrongCount == 0) continue; - - AnsiConsole.MarkupLine($"[red]\tsliceId - {sliceReport.SliceId} | WrongCount={sliceReport.WrongCount}; Duplicates={sliceReport.Duplicates} | " + - $"Example Wrong Person Id = {sliceReport.ExampleWrongPersonId}[/]"); + string sliceMsg = $"[red]\tsliceId={sliceReport.SliceId} " + + $"| WrongCount={sliceReport.WrongCount}; Duplicates={sliceReport.Duplicates} " + + $"| Example Person Id={sliceReport.ExampleWrongPersonId}" + + $"| {vendor.Name} {buildingId} {chunkReport.ChunkId} {sliceReport.SliceId} true"; + AnsiConsole.MarkupLine($"{sliceMsg}[/]"); } AnsiConsole.WriteLine(); From c5b40f56bab34b4edeb819d2afe24c4e1000c3b9 Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Tue, 12 Nov 2024 17:14:25 +0100 Subject: [PATCH 35/37] Alter output --- sources/RunValidation/Validation.cs | 45 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 9ab1d5fd..75eb5405 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -14,6 +14,7 @@ using System.Collections.Concurrent; using System.IO; using System.Diagnostics.Eventing.Reader; +using org.ohdsi.cdm.framework.common.Omop; namespace RunValidation { @@ -70,12 +71,14 @@ class ChunkReport public int ChunkId { get; set; } public int OnlyInBatchIdsCount { get; set; } public List AllSlicesWithOnlyInBatchIds { get; set; } = new List(); - public PersonInS3Chunk? 
ExamplePersonWithCalculatedSlice { get; set; } + public List PersonsWithCalculatedSlice { get; set; } = new List(); public List SliceReports { get; set; } = new List(); } class SliceReport { + public int BuildingId { get; set; } + public int ChunkId { get; set; } public int SliceId { get; set; } public int WrongCount { get; set; } public int Duplicates { get; set; } @@ -109,29 +112,29 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) foreach (var chunkReport in _chunkReports.OrderBy(c => c.ChunkId)) { - string chunkMsg = $"[red]chunkId={chunkReport.ChunkId}"; - if (chunkReport.OnlyInBatchIdsCount > 0) - chunkMsg += $" | PersonsOnlyInBatch={chunkReport.OnlyInBatchIdsCount} | " + - $"SlicesWithPersonsOnlyInBatch={string.Join(",", chunkReport.AllSlicesWithOnlyInBatchIds)} | " + - $"Example PersonId={chunkReport.ExamplePersonWithCalculatedSlice?.PersonId}, Calculated SliceId={chunkReport.ExamplePersonWithCalculatedSlice?.SliceId}"; - - if (!chunkReport.SliceReports.Any(s => s.WrongCount > 0) && chunkReport.OnlyInBatchIdsCount == 0) - continue; - - AnsiConsole.MarkupLine($"{chunkMsg}[/]"); + var sliceIdPersons = chunkReport.PersonsWithCalculatedSlice.DistinctBy(s => s.SliceId).ToList(); + foreach (var person in sliceIdPersons) + { + var chunkMsg = $"chunkId={chunkReport.ChunkId}" + + $" sliceId={person.SliceId}" + + $" (personId={person.PersonId})" + + $" | {vendor.Name} {buildingId} {chunkReport.ChunkId} {person.SliceId.ToString()!.PadLeft(4, '0')} true" + + $" | Info: LostPersonCount={chunkReport.PersonsWithCalculatedSlice.Where(s => s.SliceId == person.SliceId).Count()})"; + AnsiConsole.MarkupLine($"[red]{chunkMsg}[/]"); + } foreach (var sliceReport in chunkReport.SliceReports.OrderBy(s => s.SliceId)) { - if (sliceReport.WrongCount == 0) + if (sliceReport.WrongCount == 0 || sliceIdPersons.Any(s => s.SliceId == sliceReport.SliceId)) continue; - string sliceMsg = $"[red]\tsliceId={sliceReport.SliceId} " + - $"| 
WrongCount={sliceReport.WrongCount}; Duplicates={sliceReport.Duplicates} " + - $"| Example Person Id={sliceReport.ExampleWrongPersonId}" + - $"| {vendor.Name} {buildingId} {chunkReport.ChunkId} {sliceReport.SliceId} true"; - AnsiConsole.MarkupLine($"{sliceMsg}[/]"); - } - AnsiConsole.WriteLine(); + string sliceMsg = $"chunkId={sliceReport.ChunkId}" + + $" sliceId={sliceReport.SliceId}" + + $" (personId={sliceReport.ExampleWrongPersonId})" + + $" | {vendor.Name} {buildingId} {chunkReport.ChunkId} {sliceReport.SliceId.ToString().PadLeft(4, '0')} true" + + $" | Info: Duplicates={sliceReport.Duplicates}"; + AnsiConsole.MarkupLine($"[red]{sliceMsg}[/]"); + } } @@ -319,7 +322,7 @@ private void ValidateChunkIdWithProgress( chunkReport.OnlyInBatchIdsCount = personsInBatchOnly.Count; chunkReport.AllSlicesWithOnlyInBatchIds = slicesToCheck; - chunkReport.ExamplePersonWithCalculatedSlice = personsInBatchOnly.First(); + chunkReport.PersonsWithCalculatedSlice = personsInBatchOnly.ToList(); } lock (_chunkReports) @@ -511,6 +514,8 @@ private void ValidateSliceId( { var sliceReport = new SliceReport { + BuildingId = buildingId, + ChunkId = chunkId, SliceId = sliceId, WrongCount = slicePersonIdsWrongCount.Count, Duplicates = slicePersonIdsDuplicated.Count, From dc485a46b31dc432ec538b9d5da707e89824616d Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Tue, 12 Nov 2024 17:38:28 +0100 Subject: [PATCH 36/37] Fix order. Revert current branch'es ETlLibrary argument input order swap. 
--- .../org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs | 2 +- .../Desktop/Settings/BuildingSettings.cs | 2 +- sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs | 2 +- .../org.ohdsi.cdm.presentation.lambdabuilder/Function.cs | 2 +- .../org.ohdsi.cdm.presentation.lambdamerge/Function.cs | 2 +- sources/RunValidation/Program.cs | 2 +- sources/RunValidation/Validation.cs | 2 +- sources/Tests/RunETL/Program.cs | 2 +- sources/Tests/RunLocal/Program.cs | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs index d69e3cd3..8cf2f3b3 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Utility/EtlLibrary.cs @@ -111,7 +111,7 @@ public static void LoadVendorSettings(string etlLibraryPath, IVendorSettings set } } - public static Vendor CreateVendorInstance(string name, string etlLibraryPath) + public static Vendor CreateVendorInstance(string etlLibraryPath, string name) { foreach (var assembly in GetETLAssemblies(etlLibraryPath)) { diff --git a/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs b/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs index 2a6c4125..f0a6fe29 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Desktop/Settings/BuildingSettings.cs @@ -214,7 +214,7 @@ private void SetFrom(IDataReader reader) Console.WriteLine("Vendor: " + reader.GetString("Vendor")); Console.WriteLine("EtlLibraryPath: " + EtlLibraryPath); - this.Vendor = EtlLibrary.CreateVendorInstance(reader.GetString("Vendor"), EtlLibraryPath); + this.Vendor = EtlLibrary.CreateVendorInstance(EtlLibraryPath, reader.GetString("Vendor")); SetVendorSettings(); SetVocabularyVersion(); diff --git 
a/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs b/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs index 18ab49a7..8160bc75 100644 --- a/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs +++ b/sources/Presentation/org.ohdsi.cdm.presentation.etl2/Program.cs @@ -86,7 +86,7 @@ static int Main(string[] arguments) IConfigurationRoot configuration = builder.Build(); - vendor = EtlLibrary.CreateVendorInstance(vendorName, configuration.GetSection("AppSettings")["etlLibraryPath"]); + vendor = EtlLibrary.CreateVendorInstance(configuration.GetSection("AppSettings")["etlLibraryPath"], vendorName); var builderConnectionString = configuration.GetConnectionString("Builder"); diff --git a/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs b/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs index b66fe9f4..cefc8a8a 100644 --- a/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs +++ b/sources/Presentation/org.ohdsi.cdm.presentation.lambdabuilder/Function.cs @@ -264,7 +264,7 @@ public async Task FunctionHandler(S3Event evnt, ILambdaContext context) //} var vendorName = s3Event.Object.Key.Split('.')[0].Split('/').Last(); - vendor = EtlLibrary.CreateVendorInstance(vendorName, EtlLibraryPath); + vendor = EtlLibrary.CreateVendorInstance(EtlLibraryPath, vendorName); buildingId = int.Parse(s3Event.Object.Key.Split('.')[1]); _chunkId = int.Parse(s3Event.Object.Key.Split('.')[2]); diff --git a/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs b/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs index 58152957..3389f667 100644 --- a/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs +++ b/sources/Presentation/org.ohdsi.cdm.presentation.lambdamerge/Function.cs @@ -100,7 +100,7 @@ public async Task FunctionHandler(S3Event evnt, ILambdaContext context) try { var vendorName = 
_s3Event.Object.Key.Split('.')[0].Split('/').Last(); - _settings.Vendor = EtlLibrary.CreateVendorInstance(vendorName, EtlLibraryPath); + _settings.Vendor = EtlLibrary.CreateVendorInstance(EtlLibraryPath, vendorName); _settings.BuildingId = int.Parse(_s3Event.Object.Key.Split('.')[1]); _table = _s3Event.Object.Key.Split('.')[2].Trim(); _subChunkId = int.Parse(_s3Event.Object.Key.Split('.')[3]); diff --git a/sources/RunValidation/Program.cs b/sources/RunValidation/Program.cs index 42c7ca0c..43a86827 100644 --- a/sources/RunValidation/Program.cs +++ b/sources/RunValidation/Program.cs @@ -78,7 +78,7 @@ static void RunWithOptions(Options opts) Console.WriteLine($"PersonId: {opts.PersonId.ToString() ?? ""}"); Console.WriteLine(); - Vendor vendor = EtlLibrary.CreateVendorInstance(opts.Vendor, opts.EtlLibraryPath); + Vendor vendor = EtlLibrary.CreateVendorInstance(opts.EtlLibraryPath, opts.Vendor); var validation = new Validation(_awsAccessKeyId, _awsSecretAccessKey, _bucket, opts.LocalTmpPath, _cdmFolder); if (opts.PersonId.HasValue) validation.ValidatePersonIdInSlice(vendor, opts.BuildingId, opts.Chunks.First(), opts.PersonId.Value); diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 75eb5405..9e3bfa0e 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -112,7 +112,7 @@ public void ValidateBuildingId(Vendor vendor, int buildingId, List chunks) foreach (var chunkReport in _chunkReports.OrderBy(c => c.ChunkId)) { - var sliceIdPersons = chunkReport.PersonsWithCalculatedSlice.DistinctBy(s => s.SliceId).ToList(); + var sliceIdPersons = chunkReport.PersonsWithCalculatedSlice.DistinctBy(s => s.SliceId).OrderBy(s => s.SliceId).ToList(); foreach (var person in sliceIdPersons) { var chunkMsg = $"chunkId={chunkReport.ChunkId}" + diff --git a/sources/Tests/RunETL/Program.cs b/sources/Tests/RunETL/Program.cs index c960b7eb..c26b8c10 100644 --- a/sources/Tests/RunETL/Program.cs +++ 
b/sources/Tests/RunETL/Program.cs @@ -30,7 +30,7 @@ static void Main(string[] args) { chunkscnt = o.ChunksCnt.Value; slicescnt = o.SlicesCnt.Value; - vendor = EtlLibrary.CreateVendorInstance(o.Vendor, ConfigurationManager.AppSettings["etlLibraryPath"]); + vendor = EtlLibrary.CreateVendorInstance(ConfigurationManager.AppSettings["etlLibraryPath"], o.Vendor); buildingid = o.Buildingid.Value; }); diff --git a/sources/Tests/RunLocal/Program.cs b/sources/Tests/RunLocal/Program.cs index 9e84f38f..3efb76c1 100644 --- a/sources/Tests/RunLocal/Program.cs +++ b/sources/Tests/RunLocal/Program.cs @@ -27,7 +27,7 @@ static void Main(string[] args) Console.WriteLine($"{Directory.GetCurrentDirectory()}"); - Process(EtlLibrary.CreateVendorInstance(args[0], args[5]), int.Parse(args[1]), int.Parse(args[2]), args[3], bool.Parse(args[4]), args[5]); + Process(EtlLibrary.CreateVendorInstance(args[5], args[0]), int.Parse(args[1]), int.Parse(args[2]), args[3], bool.Parse(args[4]), args[5]); Console.WriteLine("DONE"); Console.ReadLine(); From c41276ea570b2f01e41e8bf19dc2124fdfb5345d Mon Sep 17 00:00:00 2001 From: Valeriy Sedov Date: Wed, 13 Nov 2024 11:08:19 +0100 Subject: [PATCH 37/37] Fix GetHashCode --- .../org.ohdsi.cdm.framework/Common/Enums/Vendor.cs | 7 +------ sources/RunValidation/Validation.cs | 13 ++++--------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs b/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs index 9d1f625b..bf30fe05 100644 --- a/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs +++ b/sources/Framework/org.ohdsi.cdm.framework/Common/Enums/Vendor.cs @@ -35,12 +35,7 @@ public override bool Equals(object? 
obj) public override int GetHashCode() { - unchecked // prevents overflow exceptions - { - int hash = 17; - hash = hash * 23 + this.Name.GetHashCode(); - return hash; - } + return HashCode.Combine(this.Name); } #endregion diff --git a/sources/RunValidation/Validation.cs b/sources/RunValidation/Validation.cs index 9e3bfa0e..f5283f30 100644 --- a/sources/RunValidation/Validation.cs +++ b/sources/RunValidation/Validation.cs @@ -53,15 +53,10 @@ public override bool Equals(object? obj) public override int GetHashCode() { - unchecked // prevents overflow exceptions - { - int hash = 17; - hash = hash * 23 + (Vendor != null ? Vendor.GetHashCode() : 0); - hash = hash * 23 + BuildingId.GetHashCode(); - hash = hash * 23 + ChunkId.GetHashCode(); - hash = hash * 23 + PersonId.GetHashCode(); // assuming each PersonId is unique within a ChunkId - return hash; - } + return HashCode.Combine(Vendor != null ? Vendor.GetHashCode() : 0, + BuildingId, + ChunkId, + PersonId); } }