Skip to content

Commit

Permalink
* fix `BasePostContent.ProtoBufBytes` possibly being null, in which case it cannot be parsed as protobuf encoding
Browse files Browse the repository at this point in the history

* add logging of when processing of each forum starts and how long it takes
@ `SimplifyImagesInAllReplyContentsWorker.DoWork()`
@ crawler

* rename all variables with type `Stopwatch` from `sw` to `stopwatch` @ ImageBatchConsumingWorker.cs
@ imagePipeline

* round the fractional part of `processMemory` to two digits when interpolating it into the log message @ `TransformEntityWorker.Transform()`
@ shared
@ c#
  • Loading branch information
n0099 committed Jun 11, 2024
1 parent bd038ed commit 30c2970
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 15 deletions.
22 changes: 16 additions & 6 deletions c#/crawler/src/Worker/SimplifyImagesInAllReplyContentsWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,22 @@ public class SimplifyImagesInAllReplyContentsWorker(
{
protected override async Task DoWork(CancellationToken stoppingToken)
{
var stopwatch = new Stopwatch();
stopwatch.Start();
await using var dbDefaultFactory = dbContextDefaultFactory();
var db = dbDefaultFactory.Value();
foreach (var fid in from e in db.Forums select e.Fid)
{
logger.LogInformation("Simplify images in reply contents of fid {} started", fid);
await using var dbFactory = dbContextFactory();
await Transform(
() => dbFactory.Value(fid),
saveByNthEntityCount: 10000,
readingEntity => readingEntity.Pid,
readingEntity =>
{
if (readingEntity.ProtoBufBytes == null)
return new() {Pid = readingEntity.Pid, ProtoBufBytes = null};
var protoBuf = Reply.Parser.ParseFrom(readingEntity.ProtoBufBytes);
ReplyParser.SimplifyImagesInReplyContent(logger, ref protoBuf);
return new() {Pid = readingEntity.Pid, ProtoBufBytes = protoBuf.ToByteArray()};
Expand All @@ -30,13 +35,18 @@ await Transform(
p.IsModified = !ByteArrayEqualityComparer.Instance.Equals(p.OriginalValue, p.CurrentValue);
},
(writingDb, writingEntities) => replyContentImageSaver
.Save(writingDb, writingEntities.Select(e => new ReplyPost
{
Pid = e.Pid,
Content = null!,
ContentsProtoBuf = Reply.Parser.ParseFrom(e.ProtoBufBytes).Content
})),
.Save(writingDb, writingEntities
.Where(e => e.ProtoBufBytes != null)
.Select(e => new ReplyPost
{
Pid = e.Pid,
Content = null!,
ContentsProtoBuf = Reply.Parser.ParseFrom(e.ProtoBufBytes).Content
})),
stoppingToken);
logger.LogInformation("Simplify images in reply contents of fid {} finished after {:F2}s",
fid, stopwatch.Elapsed.TotalSeconds);
stopwatch.Restart();
}
}
}
14 changes: 7 additions & 7 deletions c#/imagePipeline/src/ImageBatchConsumingWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,18 +76,18 @@ void MarkImagesInReplyAsConsumed

logger.LogTrace("Start to consume {} image(s): [{}]",
imagesWithBytes.Count, string.Join(',', imagesInReply.Select(i => i.ImageId)));
var sw = new Stopwatch();
var stopwatch = new Stopwatch();
void LogStopwatch(string consumerType, IReadOnlyCollection<ImageId> imagesId) =>
logger.LogTrace("Spend {}ms to {} for {} image(s): [{}]",
sw.ElapsedMilliseconds, consumerType, imagesId.Count, string.Join(',', imagesId));
stopwatch.ElapsedMilliseconds, consumerType, imagesId.Count, string.Join(',', imagesId));

void ConsumeConsumer<TImage, TConsumer>(
Expression<Func<ImageInReply, bool>> selector, IReadOnlyCollection<TImage> images,
Func<Owned<TConsumer>> consumerFactory, string consumerType)
where TConsumer : IConsumer<TImage>
{
using var consumer = consumerFactory();
sw.Restart();
stopwatch.Restart();
#pragma warning disable IDE0042 // Deconstruct variable declaration
var imagesId = consumer.Value.Consume(db, images, stoppingToken);
#pragma warning restore IDE0042 // Deconstruct variable declaration
Expand Down Expand Up @@ -262,20 +262,20 @@ async Task<IEnumerable<ImageOcrLine>> ConsumeByFidWithScript(
var ocrConsumer = consumerFactory.Value(script);
await ocrConsumer.InitializePaddleOcr(stoppingToken);

var sw = new Stopwatch();
sw.Start();
var stopwatch = new Stopwatch();
stopwatch.Start();
#pragma warning disable IDE0042 // Deconstruct variable declaration
var imagesId = ocrConsumer.Consume(db, imagesInCurrentFid, stoppingToken);
#pragma warning restore IDE0042 // Deconstruct variable declaration
sw.Stop();
stopwatch.Stop();
markImageInReplyAsConsumed(imagesId.Consumed);

var failed = imagesId.Failed.ToList();
if (failed.Count != 0)
logger.LogError("Failed to detect and recognize {} script text for fid {} in {} image(s): [{}]",
script, fid, failed.Count, string.Join(',', failed));
logger.LogTrace("Spend {}ms to detect and recognize {} script text for fid {} in {} image(s): [{}]",
sw.ElapsedMilliseconds, script, fid, imagesInCurrentFid.Count,
stopwatch.ElapsedMilliseconds, script, fid, imagesInCurrentFid.Count,
string.Join(',', imagesInCurrentFid.Select(i => i.ImageId)));

return ocrConsumer.RecognizedTextLines;
Expand Down
5 changes: 3 additions & 2 deletions c#/shared/src/TransformEntityWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ async Task SaveThenLog(int processedCount, Process currentProcess)
writingEntities.Clear();
writingDb.ChangeTracker.Clear();

logger.LogTrace("processedEntityCount:{} updatedEntityCount:{} elapsed:{}ms processMemory:{}MiB exceptions:{}",
logger.LogTrace("processedEntityCount:{} updatedEntityCount:{} elapsed:{}ms processMemory:{:F2}MiB exceptions:{}",
processedCount, updatedEntityCount,
stopwatch.ElapsedMilliseconds,
currentProcess.PrivateMemorySize64 / 1024f / 1024,
Expand All @@ -66,7 +66,8 @@ async Task SaveThenLog(int processedCount, Process currentProcess)
foreach (var readingEntity in readingEntities)
{
processedEntityCount++;
if (processedEntityCount % saveByNthEntityCount == 0) await SaveThenLog(processedEntityCount, process);
if (processedEntityCount % saveByNthEntityCount == 0)
await SaveThenLog(processedEntityCount, process);
if (stoppingToken.IsCancellationRequested) break;
try
{
Expand Down

0 comments on commit 30c2970

Please sign in to comment.