feat(errors): Store error metadata for sorting #27157

Draft: wants to merge 2 commits into base: master
3 changes: 3 additions & 0 deletions posthog/models/error_tracking/error_tracking.py
@@ -24,6 +24,9 @@ class Status(models.TextChoices):
name = models.TextField(null=True, blank=True)
description = models.TextField(null=True, blank=True)

last_seen = models.DateTimeField(null=True, blank=True, auto_now_add=True)
occurrences = models.BigIntegerField(default=0)

def merge(self, issue_ids: list[str]) -> None:
fingerprints = resolve_fingerprints_for_issues(team_id=self.team.pk, issue_ids=issue_ids)

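The two new columns denormalize per-issue activity into Postgres so the issue list can be ordered by recency or volume without a ClickHouse query. A minimal usage sketch (hypothetical, assuming the standard Django ORM and the field names added above):

```python
from posthog.models.error_tracking.error_tracking import ErrorTrackingIssue

team_id = 1  # placeholder team id

# Hypothetical sorting queries enabled by the new denormalized columns.
newest_first = ErrorTrackingIssue.objects.filter(team_id=team_id).order_by("-last_seen")
most_frequent = ErrorTrackingIssue.objects.filter(team_id=team_id).order_by("-occurrences")
```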
43 changes: 43 additions & 0 deletions posthog/tasks/error_tracking.py
@@ -0,0 +1,43 @@
from sentry_sdk import capture_exception
from posthog.models.error_tracking.error_tracking import ErrorTrackingIssue
from posthog.redis import get_client
from django.db.models import F


def populate_error_tracking_issue_metrics():

Check failure on line 7 in posthog/tasks/error_tracking.py (GitHub Actions / Python code quality checks): Function is missing a return type annotation; use "-> None" if the function does not return a value.
client = get_client()
keys = client.scan_iter(match="issue_metadata:*", count=20)
for key in keys:
# check key is valid
parts = key.split(":")
if len(parts) != 3:
continue

team_id = parts[1]
issue_id = parts[2]

# Fetch data associated with the key
data = client.hgetall(key)
last_seen = data.get(b"last_seen")
occurrences = data.get(b"occurrences")

if not last_seen or not occurrences:
# there should be no case where one is missing
try:
client.delete(key)
except Exception as error:
capture_exception(error)
continue

try:
# update the issue and reset redis key
ErrorTrackingIssue.objects.filter(team=team_id, id=issue_id).update(
last_seen=last_seen, occurrences=F("occurrences") + occurrences
)
# :TRICKY: Resetting the redis key here is prone to race conditions,
# but given we sync later and the data here is not critical, just an estimate for sorting,
# I'm skipping locking and letting this be.
client.delete(key)
except Exception as error:
capture_exception(error)
continue
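The key layout this task consumes (issue_metadata:&lt;team_id&gt;:&lt;issue_id&gt; with last_seen and occurrences hash fields) is written by the Rust consumer further down. A hypothetical redis-py sketch, not part of the PR, that seeds one key the same way and shows why the task reads bytes keys back from hgetall:

```python
import redis

# Hypothetical local sketch: seed one metadata key the way the Rust writer does,
# then read it back the way the Celery task does. Assumes a local Redis.
r = redis.Redis()  # a default redis-py client returns bytes, matching data.get(b"...") above

key = "issue_metadata:2:11111111-2222-3333-4444-555555555555"  # issue_metadata:<team_id>:<issue_id>, placeholder ids
pipe = r.pipeline()
pipe.hset(key, "last_seen", "2025-01-01T12:00:00Z")  # placeholder timestamp format
pipe.hincrby(key, "occurrences", 1)
pipe.execute()

print(r.hgetall(key))
# {b'last_seen': b'2025-01-01T12:00:00Z', b'occurrences': b'1'}
```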
7 changes: 7 additions & 0 deletions posthog/tasks/scheduled.py
@@ -49,6 +49,7 @@
send_org_usage_reports,
start_poll_query_performance,
stop_surveys_reached_target,
populate_error_tracking_issue_metrics,
sync_all_organization_available_product_features,
update_event_partitions,
update_quota_limiting,
@@ -254,6 +255,12 @@ def setup_periodic_tasks(sender: Celery, **kwargs: Any) -> None:
name="clickhouse clear deleted person data",
)

sender.add_periodic_task(
crontab(minute="*/10"),
populate_error_tracking_issue_metrics.s(),
name="populate error tracking issue metrics",
)

sender.add_periodic_task(
crontab(hour="*/12"),
stop_surveys_reached_target.s(),
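For reference, the crontab expression above runs the sync every ten minutes; a small illustrative sketch of the schedule semantics (assumes celery.schedules):

```python
from celery.schedules import crontab

# minute="*/10" fires at :00, :10, :20, :30, :40 and :50 of every hour, so the
# Redis counters accumulated by cymbal are flushed to Postgres roughly every ten minutes.
every_ten_minutes = crontab(minute="*/10")
```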
7 changes: 7 additions & 0 deletions posthog/tasks/tasks.py
@@ -918,3 +918,10 @@ def calculate_external_data_rows_synced() -> None:
pass
else:
capture_external_data_rows_synced()


@shared_task(ignore_result=True)
def populate_error_tracking_issue_metrics() -> None:
from posthog.tasks.error_tracking import populate_error_tracking_issue_metrics

populate_error_tracking_issue_metrics()
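The Celery entry point defers the import of the implementation to call time, a common way to avoid import cycles when tasks are registered at startup. A hypothetical way to exercise it outside the beat schedule (assumes a configured worker or a Django shell):

```python
from posthog.tasks.tasks import populate_error_tracking_issue_metrics

# Hypothetical ad-hoc invocations; task and module names match the diff above.
populate_error_tracking_issue_metrics.delay()  # enqueue for a Celery worker
populate_error_tracking_issue_metrics()        # or run synchronously, e.g. from a Django shell
```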
47 changes: 41 additions & 6 deletions rust/Cargo.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions rust/cymbal/Cargo.toml
@@ -40,6 +40,11 @@ aws-sdk-s3 = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
moka = { workspace = true }
redis = { version = "0.27.6", features = [
"tokio-comp",
"cluster",
"cluster-async",
] }

[dev-dependencies]
httpmock = { workspace = true }
4 changes: 4 additions & 0 deletions rust/cymbal/src/app_context.rs
@@ -26,6 +26,7 @@ pub struct AppContext {
pub kafka_consumer: SingleTopicConsumer,
pub kafka_producer: FutureProducer<KafkaContext>,
pub pool: PgPool,
pub redis: Arc<redis::Client>,
pub catalog: Catalog,
pub resolver: Resolver,
pub config: Config,
@@ -51,6 +52,8 @@ impl AppContext {
let options = PgPoolOptions::new().max_connections(config.max_pg_connections);
let pool = options.connect(&config.database_url).await?;

let redis = Arc::new(redis::Client::open(config.redis_url.clone())?);

let aws_credentials = aws_sdk_s3::config::Credentials::new(
&config.object_storage_access_key_id,
&config.object_storage_secret_access_key,
@@ -100,6 +103,7 @@ impl AppContext {
kafka_consumer,
kafka_producer,
pool,
redis,
catalog,
resolver,
config: config.clone(),
3 changes: 3 additions & 0 deletions rust/cymbal/src/config.rs
@@ -30,6 +30,9 @@ pub struct Config {
#[envconfig(default = "postgres://posthog:posthog@localhost:5432/posthog")]
pub database_url: String,

#[envconfig(default = "redis://localhost:6379/")]
pub redis_url: String,

// Rust service connect directly to postgres, not via pgbouncer, so we keep this low
#[envconfig(default = "4")]
pub max_pg_connections: u32,
2 changes: 2 additions & 0 deletions rust/cymbal/src/error.rs
@@ -28,6 +28,8 @@ pub enum UnhandledError {
ByteStreamError(#[from] ByteStreamError), // AWS specific bytestream error. Idk
#[error("Unhandled serde error: {0}")]
SerdeError(#[from] serde_json::Error),
#[error("Redis error: {0}")]
RedisError(#[from] redis::RedisError),
}

// These are errors that occur during frame resolution. This excludes e.g. network errors,
38 changes: 38 additions & 0 deletions rust/cymbal/src/issue_resolution.rs
@@ -1,6 +1,11 @@
use std::sync::Arc;

use sqlx::postgres::any::AnyConnectionBackend;
use uuid::Uuid;

use redis::{RedisError, AsyncCommands};
use tracing::error;

use crate::{
error::UnhandledError,
types::{FingerprintedErrProps, OutputErrProps},
@@ -183,3 +188,36 @@ where

Ok(fingerprinted.to_output(issue_override.issue_id))
}

pub async fn track_issue_metadata(
team_id: i32,
issue_id: Uuid,
redis: Arc<redis::Client>,
// TODO: Confirm timestamp format
timestamp: String,
) -> Result<(), UnhandledError> {
let mut conn = match redis.get_multiplexed_async_connection().await {
Ok(conn) => conn,
Err(e) => {
error!("Error tracking issue metadata: {:?}", e);
return Ok(());
}
};

let redis_key = format!("issue_metadata:{}:{}", team_id, issue_id);

let res: Result<(), RedisError> = redis::pipe()
.hset(redis_key.clone(), "last_seen", timestamp)
.hincr(redis_key, "occurrences", 1)
.query_async(&mut conn)
.await;

// on error, log the error but don't propagate it
if let Err(e) = res {
error!("Error tracking issue metadata: {:?}", e);
}

Ok(())
}
7 changes: 6 additions & 1 deletion rust/cymbal/src/lib.rs
@@ -4,7 +4,7 @@ use app_context::AppContext;
use common_types::ClickHouseEvent;
use error::{EventError, UnhandledError};
use fingerprinting::generate_fingerprint;
use issue_resolution::resolve_issue;
use issue_resolution::{resolve_issue, track_issue_metadata};
use tracing::warn;
use types::{Exception, RawErrProps, Stacktrace};

@@ -55,6 +55,11 @@ pub async fn handle_event(

let mut output = resolve_issue(&context.pool, event.team_id, fingerprinted).await?;

// TODO: Tracking issue metadata is currently a best-effort operation, and we don't want to
// fail the event processing if it fails. Also, unclear what should happen on merges with redis storage.
// Right now, I expect this to go out of sync so we should have some nightly celery task to keep it in sync.
track_issue_metadata(event.team_id, output.issue_id, context.redis.clone(), event.timestamp.clone()).await?;

// TODO - I'm not sure we actually want to do this? Maybe junk drawer stuff should end up in clickhouse, and
// be directly queryable by users? Stripping it for now, so it only ends up in postgres
output.strip_frame_junk();