Skip to content

Commit

Permalink
Performance improvements to duplicate_ip checking
Browse files Browse the repository at this point in the history
When submitting a survey response, we store a hash of the request ip if available to detect repeated entries and potential abuse. IP addresses are hashed to protect privacy but checking hashes is slow, and becomes slower as the number of records grows. This is a fundamental limitation since we must store IP addresses with unique salts to protect privacy.

To improve performance, we will now keep a maximum of 1000 hash records and also otherwise age them out after 1 week.
  • Loading branch information
underbluewaters committed Nov 9, 2023
1 parent 29aa0d7 commit 1d9eef5
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 69 deletions.
70 changes: 70 additions & 0 deletions packages/api/migrations/committed/000280.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
--! Previous: sha1:6e14c22c9b93c63557203d4e25e5627d0ccf58f6
--! Hash: sha1:6084ad8c19dffb844ed6a49cab24caac6dc51841

-- Enter migration here
alter table survey_response_network_addresses drop column if exists updated_at;
alter table survey_response_network_addresses add column if not exists updated_at integer not null default extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5;

CREATE OR REPLACE FUNCTION public.before_survey_response_insert() RETURNS trigger
LANGUAGE plpgsql SECURITY DEFINER
AS $$
declare
existing survey_response_network_addresses;
begin
if current_setting('session.request_ip', true) is not null then
-- first, delete the oldest hashes of request IPs so that this
-- process doesn't become too inefficient
delete from
survey_response_network_addresses
where
updated_at <= (
select
greatest(
-- Two weeks ago,
(extract(epoch from date_trunc('hour', now())) +
(extract(minute FROM now())::int / 5) * 60 * 5) -
60 * 60 * 24 * 7,
-- Or the thousandth most recent entry updated_at
(
select
updated_at
from
survey_response_network_addresses
order by
updated_at desc
limit 1 offset 1000
)
)
);
update
survey_response_network_addresses
set
num_responses = num_responses + 1,
updated_at = extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5
where
ip_hash = crypt(
current_setting('session.request_ip', true) || NEW.survey_id::text,
ip_hash
)
returning
*
into
existing;
if existing is not null then
NEW.is_duplicate_ip = true;
else
insert into survey_response_network_addresses (
survey_id,
ip_hash
) values (
NEW.survey_id,
crypt(
current_setting('session.request_ip', true) || NEW.survey_id::text,
gen_salt('md5')
)
);
end if;
end if;
return NEW;
end;
$$;
66 changes: 0 additions & 66 deletions packages/api/migrations/current.sql
Original file line number Diff line number Diff line change
@@ -1,67 +1 @@
-- Enter migration here
alter table survey_response_network_addresses drop column if exists updated_at;
alter table survey_response_network_addresses add column if not exists updated_at integer not null default extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5;

CREATE OR REPLACE FUNCTION public.before_survey_response_insert() RETURNS trigger
LANGUAGE plpgsql SECURITY DEFINER
AS $$
declare
existing survey_response_network_addresses;
begin
if current_setting('session.request_ip', true) is not null then
-- first, delete the oldest hashes of request IPs so that this
-- process doesn't become too inefficient
delete from
survey_response_network_addresses
where
updated_at <= (
select
greatest(
-- Two weeks ago,
(extract(epoch from date_trunc('hour', now())) +
(extract(minute FROM now())::int / 5) * 60 * 5) -
60 * 60 * 24 * 7,
-- Or the thousandth most recent entry updated_at
(
select
updated_at
from
survey_response_network_addresses
order by
updated_at desc
limit 1 offset 1000
)
)
);
update
survey_response_network_addresses
set
num_responses = num_responses + 1,
updated_at = extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5
where
ip_hash = crypt(
current_setting('session.request_ip', true) || NEW.survey_id::text,
ip_hash
)
returning
*
into
existing;
if existing is not null then
NEW.is_duplicate_ip = true;
else
insert into survey_response_network_addresses (
survey_id,
ip_hash
) values (
NEW.survey_id,
crypt(
current_setting('session.request_ip', true) || NEW.survey_id::text,
gen_salt('md5')
)
);
end if;
end if;
return NEW;
end;
$$;
40 changes: 37 additions & 3 deletions packages/api/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3512,9 +3512,35 @@ CREATE FUNCTION public.before_survey_response_insert() RETURNS trigger
existing survey_response_network_addresses;
begin
if current_setting('session.request_ip', true) is not null then
-- first, delete the oldest hashes of request IPs so that this
-- process doesn't become too inefficient
delete from
survey_response_network_addresses
where
updated_at <= (
select
greatest(
-- Two weeks ago,
(extract(epoch from date_trunc('hour', now())) +
(extract(minute FROM now())::int / 5) * 60 * 5) -
60 * 60 * 24 * 7,
-- Or the thousandth most recent entry updated_at
(
select
updated_at
from
survey_response_network_addresses
order by
updated_at desc
limit 1 offset 1000
)
)
);
update
survey_response_network_addresses
set num_responses = num_responses + 1
set
num_responses = num_responses + 1,
updated_at = extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5
where
ip_hash = crypt(
current_setting('session.request_ip', true) || NEW.survey_id::text,
Expand All @@ -3534,7 +3560,7 @@ CREATE FUNCTION public.before_survey_response_insert() RETURNS trigger
NEW.survey_id,
crypt(
current_setting('session.request_ip', true) || NEW.survey_id::text,
gen_salt('bf')
gen_salt('md5')
)
);
end if;
Expand Down Expand Up @@ -14518,7 +14544,8 @@ ALTER TABLE public.survey_invites ALTER COLUMN id ADD GENERATED BY DEFAULT AS ID
CREATE TABLE public.survey_response_network_addresses (
survey_id integer NOT NULL,
ip_hash text NOT NULL,
num_responses integer DEFAULT 1 NOT NULL
num_responses integer DEFAULT 1 NOT NULL,
updated_at integer DEFAULT (date_part('epoch'::text, date_trunc('hour'::text, now())) + (((((date_part('minute'::text, now()))::integer / 5) * 60) * 5))::double precision) NOT NULL
);


Expand Down Expand Up @@ -16151,6 +16178,13 @@ CREATE TRIGGER before_survey_delete_trigger BEFORE DELETE ON public.surveys FOR
CREATE TRIGGER before_survey_invited_groups_insert_trigger BEFORE INSERT ON public.survey_invited_groups FOR EACH ROW EXECUTE FUNCTION public.before_survey_invited_groups_insert();


--
-- Name: survey_responses before_survey_response_insert_trigger; Type: TRIGGER; Schema: public; Owner: -
--

CREATE TRIGGER before_survey_response_insert_trigger BEFORE INSERT ON public.survey_responses FOR EACH ROW EXECUTE FUNCTION public.before_survey_response_insert();


--
-- Name: surveys before_survey_update_trigger; Type: TRIGGER; Schema: public; Owner: -
--
Expand Down

0 comments on commit 1d9eef5

Please sign in to comment.