diff --git a/Dockerfile.philter b/Dockerfile.philter
new file mode 100644
index 0000000..7a8611a
--- /dev/null
+++ b/Dockerfile.philter
@@ -0,0 +1,52 @@
+# syntax=docker/dockerfile:1
+FROM nvidia/cuda:12.5.0-runtime-ubuntu22.04
+
+# Install the Java runtime; skip recommended extras and drop the apt lists in
+# the same layer so they never reach the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends openjdk-17-jre \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /opt/philter/ssl /opt/philter/indexes /opt/philter/policies
+
+COPY ./distribution/README.txt /opt/philter/
+COPY ./distribution/LICENSE.txt /opt/philter/
+COPY ./distribution/indexes /opt/philter/indexes/
+COPY ./distribution/policies /opt/philter/policies/
+COPY ./distribution/philter.properties /opt/philter/
+
+# --chmod sets the executable bit at copy time (needs BuildKit) instead of a
+# follow-up `RUN chmod` that would store the jar in a second layer.
+COPY --chmod=0755 ./philter-app/target/philter.jar /opt/philter/philter.jar
+
+# Generate a self-signed certificate and append the matching SSL settings.
+# NOTE(review): the keystore password is a well-known default baked into the
+# image; replace the keystore and password for any non-development deployment.
+# The settings are appended with a redirect rather than `tee` so the password
+# is not echoed into the build log.
+RUN KEYSTORE_PASSWORD='Password123!' \
+    && keytool -genkeypair \
+        -alias philter \
+        -dname "CN=philter, O=philter, C=US" \
+        -keyalg RSA \
+        -keysize 4096 \
+        -validity 3650 \
+        -storetype PKCS12 \
+        -keystore /opt/philter/ssl/philter.p12 \
+        -storepass "${KEYSTORE_PASSWORD}" \
+        -keypass "${KEYSTORE_PASSWORD}" \
+    && printf '%s\n' \
+        "# SSL certificate settings" \
+        "server.ssl.key-store-type=PKCS12" \
+        "server.ssl.key-store=/opt/philter/ssl/philter.p12" \
+        "server.ssl.key-store-password=${KEYSTORE_PASSWORD}" \
+        "server.ssl.key-alias=philter" \
+        "#server.ssl.client-auth=want" \
+        "#server.ssl.trust-store=" \
+        "#server.ssl.trust-store-password=" \
+        >> /opt/philter/philter.properties
+
+EXPOSE 8080
+
+WORKDIR /opt/philter
+CMD ["java", "-jar", "/opt/philter/philter.jar"]
diff --git a/Dockerfile.philter-ner b/Dockerfile.philter-ner
new file mode 100644
index 0000000..ee40254
--- /dev/null
+++ b/Dockerfile.philter-ner
@@ -0,0 +1,43 @@
+# syntax=docker/dockerfile:1
+FROM nvidia/cuda:12.5.0-runtime-ubuntu22.04
+
+# Build tooling and Python; apt lists are removed in the same layer.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        g++ \
+        python3 \
+        python3-dev \
+        python3-pip \
+        python3-testresources \
+        unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bind-mount requirements.txt for this step only (needs BuildKit) so it does
+# not have to be copied in and deleted afterwards.
+RUN --mount=type=bind,source=distribution/requirements.txt,target=/tmp/requirements.txt \
+    python3 -m pip --no-cache-dir install -r /tmp/requirements.txt
+
+# Install nltk dependencies. nltk.download() reports failure through its
+# return value, so turn a failed download into a failed build.
+RUN python3 -c "import sys, nltk; sys.exit(0 if nltk.download('punkt') else 1)"
+
+# Unpack the Philter model straight from the build context. A COPY followed by
+# a later `rm` would leave the whole archive stored in an image layer.
+RUN --mount=type=bind,source=distribution/general-lite-3.0-with-base-model.zip,target=/tmp/general-lite-3.0-with-base-model.zip \
+    mkdir -p /opt/philter/models/hub/ \
+    && unzip -d /opt/philter/models/hub/ /tmp/general-lite-3.0-with-base-model.zip \
+    && mv /opt/philter/models/hub/general-lite-3.0.lens /opt/philter/models/
+
+# Set Hugging Face environment variables to run offline.
+ENV HF_HUB_DISABLE_TELEMETRY=1 \
+    HF_HUB_OFFLINE=1 \
+    DO_NOT_TRACK=1 \
+    HF_HOME=/opt/philter/models/
+
+COPY distribution/service.py /opt/philter/
+
+EXPOSE 18080
+
+CMD ["python3", "/opt/philter/service.py"]
diff --git a/Dockerfile.philter-ui b/Dockerfile.philter-ui
new file mode 100644
index 0000000..416365e
--- /dev/null
+++ b/Dockerfile.philter-ui
@@ -0,0 +1,18 @@
+# syntax=docker/dockerfile:1
+FROM ubuntu:22.04
+
+# Install the Java runtime; apt lists are removed in the same layer.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends openjdk-17-jre \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /opt/philter/ssl
+
+# --chmod replaces the separate `RUN chmod +x` layer (needs BuildKit).
+COPY --chmod=0755 ./philter-ui/target/philter-ui.jar /opt/philter/philter-ui.jar
+COPY ./distribution/philter-ui.properties /opt/philter/philter-ui.properties
+
+EXPOSE 9000
+
+WORKDIR /opt/philter
+CMD ["java", "-jar", "/opt/philter/philter-ui.jar"]
diff --git a/distribution/LICENSE.txt b/distribution/LICENSE.txt
new file mode 100644
index 0000000..7a4a3ea
--- /dev/null
+++ b/distribution/LICENSE.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/distribution/README.txt b/distribution/README.txt new file mode 100644 index 0000000..e96dde4 --- /dev/null +++ b/distribution/README.txt @@ -0,0 +1,7 @@ + ___ _ _ _ _ + | _ \ |_ (_) | |_ ___ _ _ + | _/ ' \| | | _/ -_) '_| + |_| |_||_|_|_|\__\___|_| + +Copyright (C) 2024 Philterd, LLC +https://www.philterd.ai | support@philterd.ai diff --git a/distribution/indexes/cities/_0.cfe b/distribution/indexes/cities/_0.cfe new file mode 100644 index 0000000..0eb5e7d Binary files /dev/null and b/distribution/indexes/cities/_0.cfe differ diff --git a/distribution/indexes/cities/_0.cfs b/distribution/indexes/cities/_0.cfs new file mode 100644 index 0000000..6fadaf8 Binary files /dev/null and b/distribution/indexes/cities/_0.cfs differ diff --git a/distribution/indexes/cities/_0.si b/distribution/indexes/cities/_0.si new file mode 100644 index 0000000..f0ea8b5 Binary files /dev/null and b/distribution/indexes/cities/_0.si differ diff --git a/distribution/indexes/cities/segments_2 b/distribution/indexes/cities/segments_2 new file mode 100644 index 0000000..6e95791 Binary files /dev/null and b/distribution/indexes/cities/segments_2 differ diff --git a/distribution/indexes/cities/write.lock b/distribution/indexes/cities/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/indexes/counties/_0.cfe b/distribution/indexes/counties/_0.cfe 
new file mode 100644 index 0000000..e07ee2a Binary files /dev/null and b/distribution/indexes/counties/_0.cfe differ diff --git a/distribution/indexes/counties/_0.cfs b/distribution/indexes/counties/_0.cfs new file mode 100644 index 0000000..7f0009b Binary files /dev/null and b/distribution/indexes/counties/_0.cfs differ diff --git a/distribution/indexes/counties/_0.si b/distribution/indexes/counties/_0.si new file mode 100644 index 0000000..1e114e9 Binary files /dev/null and b/distribution/indexes/counties/_0.si differ diff --git a/distribution/indexes/counties/segments_1 b/distribution/indexes/counties/segments_1 new file mode 100644 index 0000000..1798fc6 Binary files /dev/null and b/distribution/indexes/counties/segments_1 differ diff --git a/distribution/indexes/counties/segments_2 b/distribution/indexes/counties/segments_2 new file mode 100644 index 0000000..2f38e18 Binary files /dev/null and b/distribution/indexes/counties/segments_2 differ diff --git a/distribution/indexes/counties/write.lock b/distribution/indexes/counties/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/indexes/hospital-abbreviations/_0.cfe b/distribution/indexes/hospital-abbreviations/_0.cfe new file mode 100644 index 0000000..f90f084 Binary files /dev/null and b/distribution/indexes/hospital-abbreviations/_0.cfe differ diff --git a/distribution/indexes/hospital-abbreviations/_0.cfs b/distribution/indexes/hospital-abbreviations/_0.cfs new file mode 100644 index 0000000..23eb2f0 Binary files /dev/null and b/distribution/indexes/hospital-abbreviations/_0.cfs differ diff --git a/distribution/indexes/hospital-abbreviations/_0.si b/distribution/indexes/hospital-abbreviations/_0.si new file mode 100644 index 0000000..1f82ce1 Binary files /dev/null and b/distribution/indexes/hospital-abbreviations/_0.si differ diff --git a/distribution/indexes/hospital-abbreviations/segments_1 b/distribution/indexes/hospital-abbreviations/segments_1 new file mode 100644 index 
0000000..640ca8e Binary files /dev/null and b/distribution/indexes/hospital-abbreviations/segments_1 differ diff --git a/distribution/indexes/hospital-abbreviations/segments_2 b/distribution/indexes/hospital-abbreviations/segments_2 new file mode 100644 index 0000000..9eeaca8 Binary files /dev/null and b/distribution/indexes/hospital-abbreviations/segments_2 differ diff --git a/distribution/indexes/hospital-abbreviations/write.lock b/distribution/indexes/hospital-abbreviations/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/indexes/hospitals/_0.cfe b/distribution/indexes/hospitals/_0.cfe new file mode 100644 index 0000000..6e120c5 Binary files /dev/null and b/distribution/indexes/hospitals/_0.cfe differ diff --git a/distribution/indexes/hospitals/_0.cfs b/distribution/indexes/hospitals/_0.cfs new file mode 100644 index 0000000..2b8f7e9 Binary files /dev/null and b/distribution/indexes/hospitals/_0.cfs differ diff --git a/distribution/indexes/hospitals/_0.si b/distribution/indexes/hospitals/_0.si new file mode 100644 index 0000000..5e66e3c Binary files /dev/null and b/distribution/indexes/hospitals/_0.si differ diff --git a/distribution/indexes/hospitals/segments_2 b/distribution/indexes/hospitals/segments_2 new file mode 100644 index 0000000..d584ff1 Binary files /dev/null and b/distribution/indexes/hospitals/segments_2 differ diff --git a/distribution/indexes/hospitals/write.lock b/distribution/indexes/hospitals/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/indexes/names/_0.cfe b/distribution/indexes/names/_0.cfe new file mode 100644 index 0000000..ff9f474 Binary files /dev/null and b/distribution/indexes/names/_0.cfe differ diff --git a/distribution/indexes/names/_0.cfs b/distribution/indexes/names/_0.cfs new file mode 100644 index 0000000..ed4407c Binary files /dev/null and b/distribution/indexes/names/_0.cfs differ diff --git a/distribution/indexes/names/_0.si b/distribution/indexes/names/_0.si 
new file mode 100644 index 0000000..d72cc7d Binary files /dev/null and b/distribution/indexes/names/_0.si differ diff --git a/distribution/indexes/names/segments_2 b/distribution/indexes/names/segments_2 new file mode 100644 index 0000000..d24e0ef Binary files /dev/null and b/distribution/indexes/names/segments_2 differ diff --git a/distribution/indexes/names/write.lock b/distribution/indexes/names/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/indexes/states/_0.cfe b/distribution/indexes/states/_0.cfe new file mode 100644 index 0000000..fa7f236 Binary files /dev/null and b/distribution/indexes/states/_0.cfe differ diff --git a/distribution/indexes/states/_0.cfs b/distribution/indexes/states/_0.cfs new file mode 100644 index 0000000..f792683 Binary files /dev/null and b/distribution/indexes/states/_0.cfs differ diff --git a/distribution/indexes/states/_0.si b/distribution/indexes/states/_0.si new file mode 100644 index 0000000..dcd837a Binary files /dev/null and b/distribution/indexes/states/_0.si differ diff --git a/distribution/indexes/states/segments_1 b/distribution/indexes/states/segments_1 new file mode 100644 index 0000000..8697a6d Binary files /dev/null and b/distribution/indexes/states/segments_1 differ diff --git a/distribution/indexes/states/segments_2 b/distribution/indexes/states/segments_2 new file mode 100644 index 0000000..87cf6d2 Binary files /dev/null and b/distribution/indexes/states/segments_2 differ diff --git a/distribution/indexes/states/write.lock b/distribution/indexes/states/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/indexes/surnames/_2.fdt b/distribution/indexes/surnames/_2.fdt new file mode 100644 index 0000000..d219060 Binary files /dev/null and b/distribution/indexes/surnames/_2.fdt differ diff --git a/distribution/indexes/surnames/_2.fdx b/distribution/indexes/surnames/_2.fdx new file mode 100644 index 0000000..40a2ae2 Binary files /dev/null and 
b/distribution/indexes/surnames/_2.fdx differ diff --git a/distribution/indexes/surnames/_2.fnm b/distribution/indexes/surnames/_2.fnm new file mode 100644 index 0000000..dc3cc7b Binary files /dev/null and b/distribution/indexes/surnames/_2.fnm differ diff --git a/distribution/indexes/surnames/_2.si b/distribution/indexes/surnames/_2.si new file mode 100644 index 0000000..30119dd Binary files /dev/null and b/distribution/indexes/surnames/_2.si differ diff --git a/distribution/indexes/surnames/_2_Lucene50_0.doc b/distribution/indexes/surnames/_2_Lucene50_0.doc new file mode 100644 index 0000000..4824725 Binary files /dev/null and b/distribution/indexes/surnames/_2_Lucene50_0.doc differ diff --git a/distribution/indexes/surnames/_2_Lucene50_0.tim b/distribution/indexes/surnames/_2_Lucene50_0.tim new file mode 100644 index 0000000..5d35fc2 Binary files /dev/null and b/distribution/indexes/surnames/_2_Lucene50_0.tim differ diff --git a/distribution/indexes/surnames/_2_Lucene50_0.tip b/distribution/indexes/surnames/_2_Lucene50_0.tip new file mode 100644 index 0000000..56ccec6 Binary files /dev/null and b/distribution/indexes/surnames/_2_Lucene50_0.tip differ diff --git a/distribution/indexes/surnames/segments_2 b/distribution/indexes/surnames/segments_2 new file mode 100644 index 0000000..b09c7a8 Binary files /dev/null and b/distribution/indexes/surnames/segments_2 differ diff --git a/distribution/indexes/surnames/write.lock b/distribution/indexes/surnames/write.lock new file mode 100644 index 0000000..e69de29 diff --git a/distribution/philter.properties b/distribution/philter.properties new file mode 100644 index 0000000..2b629e9 --- /dev/null +++ b/distribution/philter.properties @@ -0,0 +1,32 @@ +# General configuration +server.port=8080 +logging.level.root=INFO +ner.endpoint=http://philter-ner:18080 + +# Span Disambiguation +span.disambiguation.enabled=false + +# Cache Service +cache.redis.enabled=false +cache.redis.host= +cache.redis.port=6379 
+cache.redis.auth.token= +cache.redis.ssl=false +cache.redis.truststore= +cache.redis.truststore.password= +cache.redis.keystore= +cache.redis.keystore.password= + +# Policies +policies.directory=./policies/ +policies.s3.bucket= +policies.s3.region=us-east-1 + +# Metrics +metrics.prefix=philter +metrics.step=60 +metrics.hostname= +metrics.prometheus.enabled=true +metrics.jmx.enabled=false +metrics.datadog.enabled=false +metrics.datadog.apikey= diff --git a/distribution/policies/default.json b/distribution/policies/default.json new file mode 100644 index 0000000..7105617 --- /dev/null +++ b/distribution/policies/default.json @@ -0,0 +1,73 @@ +{ + "name": "default", + "ignored": [], + "identifiers": { + "dictionaries": [], + "person": { + "personFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "age": { + "ageFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "creditCard": { + "creditCardFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "date": { + "dateFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "emailAddress": { + "emailAddressFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "ipAddress": { + "ipAddressFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "phoneNumber": { + "phoneNumberFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "ssn": { + "ssnFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "url": { + "urlFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "vin": { + "vinFilterStrategies": [{ + "strategy": "REDACT", + "redactionFormat": "{{{REDACTED-%t}}}" + }] + }, + "zipCode": { + "zipCodeFilterStrategies": [{ + "strategy": 
"REDACT",
+        "redactionFormat": "{{{REDACTED-%t}}}"
+      }]
+    }
+  }
+}
diff --git a/distribution/requirements.txt b/distribution/requirements.txt
new file mode 100644
index 0000000..960dfc9
--- /dev/null
+++ b/distribution/requirements.txt
@@ -0,0 +1,16 @@
+# MAIN PACKAGES
+flair==0.12.1
+torch==2.0.1
+
+# DEPENDENCIES
+Cython==3.0.2
+setuptools==68.1.2
+wheel==0.41.2
+numpy==1.24.4
+textblob==0.17.1
+CherryPy==18.8.0
+nltk==3.8.1
+scipy==1.10.1
+
+# SPECIAL DEPENDENCIES
+huggingface_hub==0.10.0
diff --git a/distribution/service.py b/distribution/service.py
new file mode 100644
index 0000000..37b4886
--- /dev/null
+++ b/distribution/service.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python3
+"""HTTP service that tags person names in text with a flair sequence tagger."""
+
+from textblob import TextBlob
+from flair.models import SequenceTagger
+from flair.data import Sentence
+import cherrypy
+import json
+import os
+
+# Directory searched for the ".lens" model file.
+MODELS_DIRECTORY = "/opt/philter/models/"
+
+
+class Span:
+    """One tagged entity: its text, tag, score and start/end offsets."""
+
+    def __init__(self, text, tag, score, start, end):
+        self.text = text
+        self.tag = tag
+        self.score = score
+        self.start = start
+        self.end = end
+
+
+class Response(object):
+    """Reply payload: the echoed c/d/p request values plus the spans found."""
+
+    def __init__(self, c, d, p, spans):
+        self.c = c
+        self.d = d
+        self.p = p
+        self.spans = spans
+
+
+def obj_dict(obj):
+    """json.dumps fallback: serialize an object as its attribute dictionary."""
+    return obj.__dict__
+
+
+def get_lens_file():
+    """Return the path of the first ".lens" file (by name) in MODELS_DIRECTORY.
+
+    Raises:
+        FileNotFoundError: if the directory holds no ".lens" file.
+    """
+    # Sorted so the choice is stable when several lens files are present.
+    for f in sorted(os.listdir(MODELS_DIRECTORY)):
+        if f.endswith(".lens"):
+            print("Using lens file " + f)
+            return MODELS_DIRECTORY + f
+    # Fail with a clear message rather than returning None to the loader.
+    raise FileNotFoundError("No .lens model file found in " + MODELS_DIRECTORY)
+
+
+file = get_lens_file()
+model = SequenceTagger.load(file)
+
+
+class PhilterModelService(object):
+    """CherryPy handlers for the NER service."""
+
+    @cherrypy.expose
+    def process(self, c='none', d='none', p=0):
+        """Tag the request body and return the 'PER' spans as a JSON string.
+
+        c, d and p are echoed back unchanged in the response.
+        """
+        text = cherrypy.request.body.read().decode('utf-8')
+
+        blob = TextBlob(text)
+        blob_sentences = blob.sentences
+        sentences = [Sentence(s.raw) for s in blob_sentences]
+
+        # Nothing to tag for an empty body.
+        if sentences:
+            model.predict(sentences)
+
+        spans = []
+        # Pair each flair sentence with its TextBlob source so every entity
+        # offset can be shifted by that sentence's start index in the body.
+        for blob_sentence, sentence in zip(blob_sentences, sentences):
+            start_pos = blob_sentence.start_index
+            for entity in sentence.get_spans('ner'):
+                if entity.tag == 'PER':
+                    spans.append(Span(entity.text, entity.tag, entity.score,
+                                      entity.start_position + start_pos,
+                                      entity.end_position + start_pos))
+
+        return json.dumps(Response(c, d, p, spans), default=obj_dict)
+
+    @cherrypy.expose
+    def status(self):
+        """Return a short health string naming the loaded model file."""
+        return "healthy: " + file
+
+
+if __name__ == '__main__':
+    cherrypy.config.update({'server.socket_host': '0.0.0.0', 'server.socket_port': 18080})
+    cherrypy.quickstart(PhilterModelService())
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..cf7516e
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,32 @@
+services:
+  philter-ner:
+    build:
+      context: ./
+      dockerfile: Dockerfile.philter-ner
+    ports:
+      - "8000:18080"
+    networks:
+      - philter
+  philter:
+    depends_on:
+      - philter-ner
+    build:
+      context: ./
+      dockerfile: Dockerfile.philter
+    ports:
+      - "8080:8080"
+    networks:
+      - philter
+  philter-ui:
+    depends_on:
+      - philter-ner
+      - philter
+    build:
+      context: ./
+      dockerfile: Dockerfile.philter-ui
+    ports:
+      - "9000:9000"
+    networks:
+      - philter
+networks:
+  philter: