Commit 8c18e2e

feat(errors): Store error metadata for sorting

neilkakkar committed Dec 26, 2024
1 parent 34a4f6b commit 8c18e2e
Showing 11 changed files with 151 additions and 7 deletions.
3 changes: 3 additions & 0 deletions posthog/models/error_tracking/error_tracking.py
@@ -24,6 +24,9 @@ class Status(models.TextChoices):
name = models.TextField(null=True, blank=True)
description = models.TextField(null=True, blank=True)

last_seen = models.DateTimeField(null=True, blank=True, auto_now_add=True)
occurrences = models.BigIntegerField(default=0)

def merge(self, issue_ids: list[str]) -> None:
fingerprints = resolve_fingerprints_for_issues(team_id=self.team.pk, issue_ids=issue_ids)

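
Note: these two fields are what the commit title calls "error metadata for sorting". A minimal usage sketch of how a list view might order issues once they are populated (hypothetical helper, not part of this commit):

from posthog.models.error_tracking.error_tracking import ErrorTrackingIssue

def issues_sorted(team_id: int, order: str = "-last_seen"):
    # order is assumed to be "-last_seen" (most recent first) or
    # "-occurrences" (most frequent first)
    return ErrorTrackingIssue.objects.filter(team_id=team_id).order_by(order)
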
35 changes: 35 additions & 0 deletions posthog/tasks/error_tracking.py
@@ -0,0 +1,35 @@
from sentry_sdk import capture_exception
from posthog.models.error_tracking.error_tracking import ErrorTrackingIssue
from posthog.redis import get_client
from django.db.models import F


def populate_error_tracking_issue_metrics() -> None:
    client = get_client()
keys = client.scan_iter(match="issue_metadata:*", count=20)
for key in keys:
        # check key is valid; scan_iter yields bytes, so decode before splitting
        parts = key.decode("utf-8").split(":")
if len(parts) != 3:
continue

team_id = parts[1]
issue_id = parts[2]

        # Fetch data associated with the key; the redis client returns bytes
        data = client.hgetall(key)
        last_seen = data.get(b"last_seen")
        occurrences = data.get(b"occurrences")
        if last_seen is None or occurrences is None:
            continue
        last_seen = last_seen.decode("utf-8")
        occurrences = int(occurrences)

try:
# update the issue and reset redis key
ErrorTrackingIssue.objects.filter(team=team_id, id=issue_id).update(
last_seen=last_seen, occurrences=F("occurrences") + occurrences
)
# :TRICKY: Resetting the redis key here is prone to race conditions,
# but given we sync later and the data here is not critical, just an estimate for sorting,
# I'm skipping locking and letting this be.
client.delete(key)
except Exception as error:
capture_exception(error)
continue
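
The :TRICKY: comment above tolerates a small race: an HINCRBY that lands between the HGETALL and the DELETE is lost. If the estimate ever needed to be exact, one option is to read and clear the hash in a single MULTI/EXEC transaction so no increment can interleave. A sketch under that assumption, using redis-py's transactional pipeline (not part of this commit):

def read_and_reset(client, key: bytes) -> dict:
    # Commands queued on a transaction=True pipeline execute as one
    # MULTI/EXEC block, so a concurrent HINCRBY lands either before it
    # (captured now) or after it (kept for the next sweep), never between.
    pipe = client.pipeline(transaction=True)
    pipe.hgetall(key)
    pipe.delete(key)
    data, _deleted = pipe.execute()
    return data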
7 changes: 7 additions & 0 deletions posthog/tasks/scheduled.py
@@ -49,6 +49,7 @@
send_org_usage_reports,
start_poll_query_performance,
stop_surveys_reached_target,
populate_error_tracking_issue_metrics,
sync_all_organization_available_product_features,
update_event_partitions,
update_quota_limiting,
@@ -254,6 +255,12 @@ def setup_periodic_tasks(sender: Celery, **kwargs: Any) -> None:
name="clickhouse clear deleted person data",
)

sender.add_periodic_task(
crontab(minute="*/10"),
populate_error_tracking_issue_metrics.s(),
name="populate error tracking issue metrics",
)

sender.add_periodic_task(
crontab(hour="*/12"),
stop_surveys_reached_target.s(),
7 changes: 7 additions & 0 deletions posthog/tasks/tasks.py
@@ -918,3 +918,10 @@ def calculate_external_data_rows_synced() -> None:
pass
else:
capture_external_data_rows_synced()


@shared_task(ignore_result=True)
def populate_error_tracking_issue_metrics() -> None:
    # Import lazily and alias the helper so it doesn't shadow this task's name.
    from posthog.tasks.error_tracking import populate_error_tracking_issue_metrics as sync_issue_metrics

    sync_issue_metrics()
47 changes: 41 additions & 6 deletions rust/Cargo.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions rust/cymbal/Cargo.toml
@@ -40,6 +40,11 @@ aws-sdk-s3 = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
moka = { workspace = true }
redis = { version = "0.27.6", features = [
"tokio-comp",
"cluster",
"cluster-async",
] }

[dev-dependencies]
httpmock = { workspace = true }
4 changes: 4 additions & 0 deletions rust/cymbal/src/app_context.rs
@@ -26,6 +26,7 @@ pub struct AppContext {
pub kafka_consumer: SingleTopicConsumer,
pub kafka_producer: FutureProducer<KafkaContext>,
pub pool: PgPool,
pub redis: Arc<redis::Client>,
pub catalog: Catalog,
pub resolver: Resolver,
pub config: Config,
@@ -51,6 +52,8 @@ impl AppContext {
let options = PgPoolOptions::new().max_connections(config.max_pg_connections);
let pool = options.connect(&config.database_url).await?;

let redis = Arc::new(redis::Client::open(config.redis_url.clone())?);

let aws_credentials = aws_sdk_s3::config::Credentials::new(
&config.object_storage_access_key_id,
&config.object_storage_secret_access_key,
@@ -100,6 +103,7 @@ impl AppContext {
kafka_consumer,
kafka_producer,
pool,
redis,
catalog,
resolver,
config: config.clone(),
3 changes: 3 additions & 0 deletions rust/cymbal/src/config.rs
@@ -30,6 +30,9 @@ pub struct Config {
#[envconfig(default = "postgres://posthog:posthog@localhost:5432/posthog")]
pub database_url: String,

#[envconfig(default = "redis://localhost:6379/")]
pub redis_url: String,

// Rust services connect directly to postgres, not via pgbouncer, so we keep this low
#[envconfig(default = "4")]
pub max_pg_connections: u32,
2 changes: 2 additions & 0 deletions rust/cymbal/src/error.rs
@@ -28,6 +28,8 @@ pub enum UnhandledError {
ByteStreamError(#[from] ByteStreamError), // AWS specific bytestream error. Idk
#[error("Unhandled serde error: {0}")]
SerdeError(#[from] serde_json::Error),
#[error("Redis error: {0}")]
RedisError(#[from] redis::RedisError),
}

// These are errors that occur during frame resolution. This excludes e.g. network errors,
38 changes: 38 additions & 0 deletions rust/cymbal/src/issue_resolution.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
use std::sync::Arc;

use sqlx::postgres::any::AnyConnectionBackend;
use uuid::Uuid;

use redis::{RedisError, AsyncCommands};
use tracing::error;

use crate::{
error::UnhandledError,
types::{FingerprintedErrProps, OutputErrProps},
@@ -183,3 +188,36 @@

Ok(fingerprinted.to_output(issue_override.issue_id))
}

pub async fn track_issue_metadata(
team_id: i32,
issue_id: Uuid,
redis: Arc<redis::Client>,
// TODO: Confirm timestamp format
timestamp: String,
) -> Result<(), UnhandledError> {
    let mut conn = match redis.get_multiplexed_async_connection().await {
Ok(conn) => conn,
Err(e) => {
error!("Error tracking issue metadata: {:?}", e);
return Ok(());
}
};

let redis_key = format!("issue_metadata:{}:{}", team_id, issue_id);

let res: Result<(), RedisError> = redis::pipe()
.hset(redis_key.clone(), "last_seen", timestamp)
.hincr(redis_key, "occurrences", 1)
.query_async(&mut conn)
.await;

// on error, log the error but don't propagate it
if let Err(e) = res {
error!("Error tracking issue metadata: {:?}", e);
}

Ok(())
}
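
The hash written here is the other half of the contract consumed by populate_error_tracking_issue_metrics above: one hash per issue at issue_metadata:{team_id}:{issue_id}, holding a last_seen timestamp and an occurrences counter. A Python sketch of the same write, mirroring the Rust pipeline (hypothetical, e.g. for tests against redis-py):

import redis

def track_issue_metadata(client: redis.Redis, team_id: int, issue_id: str, timestamp: str) -> None:
    key = f"issue_metadata:{team_id}:{issue_id}"
    pipe = client.pipeline(transaction=False)  # plain pipelining, like redis::pipe()
    pipe.hset(key, "last_seen", timestamp)
    pipe.hincrby(key, "occurrences", 1)
    pipe.execute()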
7 changes: 6 additions & 1 deletion rust/cymbal/src/lib.rs
@@ -4,7 +4,7 @@ use app_context::AppContext;
use common_types::ClickHouseEvent;
use error::{EventError, UnhandledError};
use fingerprinting::generate_fingerprint;
use issue_resolution::resolve_issue;
use issue_resolution::{resolve_issue, track_issue_metadata};
use tracing::warn;
use types::{Exception, RawErrProps, Stacktrace};

@@ -55,6 +55,11 @@

let mut output = resolve_issue(&context.pool, event.team_id, fingerprinted).await?;

    // TODO: Tracking issue metadata is best-effort: track_issue_metadata swallows redis
    // errors internally, so event processing never fails because of it. It's also unclear
    // what should happen to these counters when issues merge, so expect the redis values
    // to drift and rely on the periodic celery task to re-sync them into postgres.
    track_issue_metadata(event.team_id, output.issue_id, context.redis.clone(), event.timestamp.clone())
        .await?;

// TODO - I'm not sure we actually want to do this? Maybe junk drawer stuff should end up in clickhouse, and
// be directly queryable by users? Stripping it for now, so it only ends up in postgres
output.strip_frame_junk();