Skip to content

Commit

Permalink
Merge branch 'main' into readme-rework
Browse files Browse the repository at this point in the history
  • Loading branch information
Superskyyy authored Dec 13, 2023
2 parents b73b81b + 82d574a commit b7ee1fd
Show file tree
Hide file tree
Showing 8 changed files with 1,429 additions and 1,498 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ on:
pull_request:
push:
branches:
- master
- main
schedule:
- cron: '0 18 * * *'

Expand Down
57 changes: 57 additions & 0 deletions .github/workflows/publish-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright 2023 SkyAPM org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow that builds the repository's Docker image and pushes it to the
# GitHub Container Registry (ghcr.io).
name: publish-docker

# Triggers: every push to the main branch, and every release of type "released".
on:
push:
branches:
- main
release:
types:
- released

jobs:
build-docker:
# NOTE(review): the repository guard below is commented out, so this job is
# not restricted to a specific repository - confirm this is intended.
# if: github.repository == 'SkyAPM/R3'
runs-on: ubuntu-latest
# The job token may read repository contents and write packages.
permissions:
contents: read
packages: write
timeout-minutes: 120
steps:
# Check out the repository together with its git submodules.
- uses: actions/checkout@v4
with:
submodules: true
# Authenticate against ghcr.io as the triggering actor using the job's GITHUB_TOKEN.
# The action is pinned to a commit SHA.
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Compute the image tags and labels for ghcr.io/<owner>/<repo>; the outputs
# are consumed by the build step through the step id "meta".
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: |
ghcr.io/${{ github.repository }}
# Build from the repository root and push the image with the tags and labels
# produced by the "meta" step.
- name: Build and push Docker images
uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
19 changes: 19 additions & 0 deletions demo/Endpoint100_counterexamples.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/hikaricp/getconnection
/hikaricp/dropcronnection
/api/v1/abcoi
/api/v1/cdeof
/api/vaoica
/api/vaiodjw
/db/getConnection
/db/dropConnection
/api/v1/abcoi
/api/v1/cdeof
/api/vaoica
/api/vaiodjw
/api/v1/usernames/update
/api/v1/usernames/delete
/api/v1/usernames/reformat
/api/v1/users/trump
/api/v1/users/dominator
/api-this-is-a-special-case/v99999/orders/high-similarity-but-update-comes-first-so-rejected/haha/haha/update/123
/api-this-is-a-special-case/v99999/orders/high-similarity-but-update-comes-first-so-rejected/haha/haha/delete/12
3 changes: 2 additions & 1 deletion demo/demo_gradio.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ def get_regex(cluster):
["Endpoint200_hard"],
["Endpoint100_trivial_3k_repeat"],
["Endpoint200_hard_3k_repeat"],
['Endpoint100_trivial_500k_perf_bench']
['Endpoint100_trivial_500k_perf_bench'],
['Endpoint100_counterexamples'],
],
)
# Shuffling dataset
Expand Down
66 changes: 31 additions & 35 deletions models/uri_drain/uri_drain.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,21 +472,45 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
sim_tokens += param_count

ret_val = float(sim_tokens) / len(seq1)

return ret_val, param_count

def create_template(self, seq1, seq2):
# MODIFIED::
assert len(seq1) == len(seq2)
ret_val = list(seq2)
seq_length = len(seq1)

# SPECIAL ASSUMPTION THAT MIGHT BE FALSE::
# /api/getconnection
# /api/dropconnection
if seq_length == 2:
if (seq1[0] == seq2[0] and seq1[1] != seq2[1] # can be simplified
and not self.has_numbers(seq1[1]) and not self.has_numbers(seq2[1])):
print(f'first token match but second token mismatch, seq1 = {seq1}, seq2 = {seq2}')
return 'rejected'
# TODO: radical assumption - if there is no digit at all in either seq1 or seq2, should we refuse to consider them similar?
# Implementing this would increase the false negative rate but decrease the false positive rate.

for i, (token1, token2) in enumerate(zip(seq1, seq2)):
pre_token = ret_val[i - 1] if i > 0 else None
sub_token = ret_val[i + 1] if i < len(ret_val) - 1 else None
if token1 != token2:
if len(seq1) == 1: # if an uri is of length 1 then it must not share any similarity with any template
if seq_length == 1: # if an uri is of length 1 then it must not share any similarity with any template
return 'rejected'
if seq_length == 2:
# This handles URIs of the form
# [0]same_domain/[1]FIRST_ACTUAL_TOKEN.
# Here we assume that FIRST_ACTUAL_TOKEN is not a param,
# so we can safely reject when the tokens don't match.
if i == 1 and '.' in pre_token:
return 'rejected'
if seq_length == 3:
# [0]scheme://[1]same_domain/[2]FIRST_ACTUAL_TOKEN,
if i == 2 and '.' in pre_token:
return 'rejected'
# First handle domains in uri
if i == 1 and ':' in pre_token: # or check : in pre_token
if i == 1 and ':' in pre_token: # it means domain mismatch
# [0]scheme://[1]same_domain/[2]FIRST_ACTUAL_TOKEN,
# self.logger.debug(f'pre_token = {pre_token}, matched cluster = {seq2}')
# TODO Check: handled in similarity check, remove this ^^^
# I assume domain can only appear in either first or second token
Expand Down Expand Up @@ -527,6 +551,8 @@ def create_template(self, seq1, seq2):
tokens of sequence2 = ('api-this-is-a-special-case', 'v99999999999999999', 'orders', 'update',
'123') rejected for content_tokens = ['api-this-is-a-special-case', 'v99999999999999999', 'orders',
'reorder', '122222222222222464643']"""

# ASSUMPTION: There cannot be two consecutive params
if pre_token == self.param_str: # or pre_token in self.possible_params:
# self.logger.debug(f'working on token {token1} and {token2}, index {i}')
# self.logger.debug(f'pre_token {pre_token} is param_str, so current token cannot be a param (assumption)')
Expand All @@ -536,6 +562,7 @@ def create_template(self, seq1, seq2):
# self.logger.debug(f'sub_token {sub_token} is param_str, so current token cannot be a param (assumption)')
# self.logger.debug(f'tokens of sequence2 = {seq2}')
return "rejected"
# ASSUMPTION: The token immediately following a version number cannot be a param
if pre_token is not None and pre_token.startswith(
'v') and pre_token[1:].isdigit():
# self.logger.debug('pre_token is a version number, so current token cannot be a param (assumption)')
Expand All @@ -558,40 +585,9 @@ def create_template(self, seq1, seq2):
# self.logger.debug(f'tokens of sequence2 = {seq2}')
return "rejected"

# REMOVED FOR NOW, not used in SkyWalking
# REPLACED WITH SIMPLE APPROACH
ret_val[i] = self.param_str
# TODO refactor this, now its just for poc and clarity
# if token1.isdigit():
# if ret_val[i] == self.param_extra['INT']:
# continue
# elif ret_val[i] == self.param_extra['STR']:
# ret_val[i] = self.param_extra['VAR']
# elif ret_val[i] == self.param_extra['VAR']:
# continue
# else: # return val is a plain segment, it can be any type, need to check
# # We enforce a cautionary approach.
# ret_val[i] = self.param_extra['INT'] if ret_val[i].isdigit() else self.param_extra['VAR']
# elif token1.isalpha():
# if ret_val[i] == self.param_extra['INT']:
# ret_val[i] = self.param_extra['VAR']
# elif ret_val[i] == self.param_extra['STR']:
# continue
# elif ret_val[i] == self.param_extra['VAR']:
# continue
# else:
# ret_val[i] = self.param_extra['STR'] if ret_val[i].isalpha() else self.param_extra['VAR']
# else:
# if ret_val[i] == self.param_extra['INT']:
# ret_val[i] = self.param_extra['VAR']
# elif ret_val[i] == self.param_extra['STR']:
# ret_val[i] = self.param_extra['VAR']
# elif ret_val[i] == self.param_extra['VAR']:
# continue
# else:
# ret_val[i] = self.param_extra['VAR']
# self.logger.debug(f'After change: {ret_val}')

# self.logger.debug(f'After change: {ret_val}')
return ret_val

def match(self, content: str, full_search_strategy="never"):
Expand Down
Loading

0 comments on commit b7ee1fd

Please sign in to comment.