From 59f7067d33eb18c311635d3f52ec45b0cfb489ad Mon Sep 17 00:00:00 2001 From: n0099 Date: Wed, 10 Jul 2024 20:48:00 +0000 Subject: [PATCH] + abstract class `BaseUser` to extract fields `(Display)?Name` from entity class `User` * fix false positive of signature existence due to `WHERE signatureId IN (s1, s2) AND xxHash3 IN (x1, x2)` not being equivalent to `WHERE (signatureId = s1 AND xxHash3 = x1) OR (signatureId = s2 AND xxHash3 = x2)` @ `ReplySignatureSaver.Save()` * fix username of historical anonymous user not falling back to null when it's an empty string @ `UserParser.Parse()` * fix outdated comments since a0f48f3e3e55683e35171d7fc3edf28128f86135 @ `UserSaver.ShouldIgnoreEntityRevision()` @ fe --- c#/crawler/src/Db/BaseUser.cs | 7 +++++++ c#/crawler/src/Db/User.cs | 4 +--- .../src/Tieba/Crawl/Parser/UserParser.cs | 2 +- .../src/Tieba/Crawl/Saver/Post/ReplySaver.cs | 2 +- .../Tieba/Crawl/Saver/Post/SubReplySaver.cs | 2 +- .../Tieba/Crawl/Saver/ReplySignatureSaver.cs | 18 ++++++++++-------- c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs | 13 ++----------- 7 files changed, 23 insertions(+), 25 deletions(-) create mode 100644 c#/crawler/src/Db/BaseUser.cs diff --git a/c#/crawler/src/Db/BaseUser.cs b/c#/crawler/src/Db/BaseUser.cs new file mode 100644 index 00000000..bdac2a53 --- /dev/null +++ b/c#/crawler/src/Db/BaseUser.cs @@ -0,0 +1,7 @@ +namespace tbm.Crawler.Db; + +public abstract class BaseUser : TimestampedEntity +{ + public string? Name { get; set; } + public string? DisplayName { get; set; } +} diff --git a/c#/crawler/src/Db/User.cs b/c#/crawler/src/Db/User.cs index 4ce32946..25d5dfce 100644 --- a/c#/crawler/src/Db/User.cs +++ b/c#/crawler/src/Db/User.cs @@ -1,10 +1,8 @@ namespace tbm.Crawler.Db; -public class User : TimestampedEntity +public class User : BaseUser { [Key] public long Uid { get; set; } - public string? Name { get; set; } - public string? DisplayName { get; set; } public required string Portrait { get; set; } public uint? 
PortraitUpdatedAt { get; set; } public byte? Gender { get; set; } diff --git a/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs b/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs index b9b0ba30..65880e12 100644 --- a/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs +++ b/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs @@ -20,7 +20,7 @@ public void Parse(IEnumerable inUsers) => return new() { Uid = uid, - Name = el.NameShow, + Name = el.NameShow.NullIfEmpty(), Portrait = portrait, PortraitUpdatedAt = portraitUpdatedAt }; diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs index 1df7b72a..7283bc1c 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs @@ -17,7 +17,7 @@ public override SaverChangeSet Save(CrawlerDbContext db) r => new ReplyRevision {TakenAt = r.UpdatedAt ?? r.CreatedAt, Pid = r.Pid}, LinqKit.PredicateBuilder.New(r => Posts.Keys.Contains(r.Pid))); - db.ReplyContents.AddRange(changeSet.NewlyAdded + db.ReplyContents.AddRange(changeSet.NewlyAdded // https://github.com/dotnet/efcore/issues/33945 .Select(r => new ReplyContent {Pid = r.Pid, ProtoBufBytes = r.Content})); PostSaveHandlers += replyContentImageSaver.Save(db, changeSet.NewlyAdded).Invoke; PostSaveHandlers += AuthorRevisionSaver.SaveAuthorExpGradeRevisions(db, changeSet.AllAfter).Invoke; diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs index 339252c1..6116b6db 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs @@ -15,7 +15,7 @@ public override SaverChangeSet Save(CrawlerDbContext db) sr => new SubReplyRevision {TakenAt = sr.UpdatedAt ?? 
sr.CreatedAt, Spid = sr.Spid}, LinqKit.PredicateBuilder.New(sr => Posts.Keys.Contains(sr.Spid))); - db.SubReplyContents.AddRange(changeSet.NewlyAdded.Select(sr => + db.SubReplyContents.AddRange(changeSet.NewlyAdded.Select(sr => // https://github.com/dotnet/efcore/issues/33945 new SubReplyContent {Spid = sr.Spid, ProtoBufBytes = sr.Content})); PostSaveHandlers += AuthorRevisionSaver.SaveAuthorExpGradeRevisions(db, changeSet.AllAfter).Invoke; diff --git a/c#/crawler/src/Tieba/Crawl/Saver/ReplySignatureSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/ReplySignatureSaver.cs index 7d558e15..4f186de9 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/ReplySignatureSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/ReplySignatureSaver.cs @@ -40,14 +40,16 @@ public Action Save(CrawlerDbContext db, IEnumerable replies) r => r.Signature, SignatureIdAndValueEqualityComparer.Instance); - var existingSignatures = ( - from s in db.ReplySignatures.AsTracking() - where signatures.Select(s2 => s2.SignatureId).Contains(s.SignatureId) - - // server side eval doesn't need ByteArrayEqualityComparer - && signatures.Select(s2 => s2.XxHash3).Contains(s.XxHash3) - select s - ).ToList(); + var existingSignatures = db.ReplySignatures.AsTracking() + .Where(signatures.Aggregate( + LinqKit.PredicateBuilder.New(), + (predicate, newOrExisting) => + predicate.Or(LinqKit.PredicateBuilder + .New(existing => + existing.SignatureId == newOrExisting.SignatureId) + .And(existing => + existing.XxHash3 == newOrExisting.XxHash3)))) + .ToList(); (from existing in existingSignatures join newInReply in signatures on existing.SignatureId equals newInReply.SignatureId select (existing, newInReply)) diff --git a/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs index 9075cb67..973cfade 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs @@ -72,22 +72,13 @@ public partial class UserSaver { protected override bool 
ShouldIgnoreEntityRevision(string propName, PropertyEntry propEntry, EntityEntry entityEntry) { - // ThreadCrawlFacade.ParseLatestRepliers() will save users with empty string as portrait - // they may soon be updated by (sub) reply crawler after it find out the latest reply + // ThreadCrawlFacade.ParseLatestRepliers() will save partial filled user of latest repliers for livepost thread + // they may later get updated by (sub) reply crawler after it find out the latest reply // so we should ignore its revision update for all fields - // ignore entire record is not possible via IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision() - // since it can only determine one field at the time if (propName != nameof(User.Portrait) || propEntry.OriginalValue is not "") return false; - - // invokes OriginalValues.ToObject() to get a new instance - // since entityInTracking is reference to the changed one var user = (User)entityEntry.OriginalValues.ToObject(); - - // create another user instance with only fields of latest replier filled var latestReplier = User.CreateLatestReplier(user.Uid, user.Name, user.DisplayName); - // if they are same by fields values, the original one is the latest replier - // that previously generated by ParseLatestRepliers() return User.EqualityComparer.Instance.Equals(user, latestReplier); }