-
Notifications
You must be signed in to change notification settings - Fork 1
/
anonymize.py
140 lines (124 loc) · 4.48 KB
/
anonymize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = 'andreas starke'
# date: 09.06.2024
import logging
import sys
import codecs
from lxml import etree
import string
import random
import datetime
import time
# ===============================================================================
# logging
# ===============================================================================
# One record layout shared by both handlers: timestamp, logger name, level, message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Console handler on stdout: lets every record from DEBUG upwards through.
ch_stdout = logging.StreamHandler(sys.stdout)
ch_stdout.setFormatter(formatter)
ch_stdout.setLevel(logging.DEBUG)

# Console handler on stderr: lets only ERROR and above through.
# Such records pass the stdout handler's threshold as well.
ch_stderr = logging.StreamHandler(sys.stderr)
ch_stderr.setFormatter(formatter)
ch_stderr.setLevel(logging.ERROR)

# Module logger; the handlers are attached in the order stdout, stderr.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(ch_stdout)
logger.addHandler(ch_stderr)
# strptime/strftime patterns that shuffle() tries, in this order, to decide
# whether a word is a date; a matching word is replaced by a random date
# rendered with the same pattern.
DATE_FORMATS = [
    "%d.%m.%Y",
    "%d-%m-%Y",
    "%d/%m/%Y",
    "%d %m %Y",
    "%Y.%m.%d",
    "%Y-%m-%d",
    "%Y/%m/%d",
    "%Y %m %d",
    "%d.%m",
    "%d/%m",
    "%d %m",
]

# Character classes (ASCII letters plus the German umlauts and sharp s).
VOWELS_LOWERCASE = ["a", "e", "i", "o", "u", "ä", "ü", "ö"]
VOWELS_UPPERCASE = ["A", "E", "I", "O", "U", "Ä", "Ü", "Ö"]

# Built by filtering the alphabet instead of via a set difference: set
# iteration order varies between interpreter runs (string hash randomization),
# which would make random.choice() irreproducible even with a fixed seed.
CONSONANTS_LOWERCASE = [
    letter for letter in string.ascii_lowercase if letter not in VOWELS_LOWERCASE
] + ["ß"]
CONSONANTS_UPPERCASE = [
    letter for letter in string.ascii_uppercase if letter not in VOWELS_UPPERCASE
]

# Every character treated as a lowercase / uppercase letter.
LETTERS_LOWERCASE = string.ascii_lowercase + "äüöß"
LETTERS_UPPERCASE = string.ascii_uppercase + "ÄÜÖ"
def _randomize_char(x):
    """Replace a single character by a random one of the same class.

    Lowercase/uppercase vowels, lowercase/uppercase consonants and digits are
    each replaced by a random member of their own class; any other character
    (punctuation, whitespace, ...) is returned unchanged.

    @param x: a single character
    @return: a single character
    """
    if x in LETTERS_LOWERCASE:
        if x in VOWELS_LOWERCASE:
            return random.choice(VOWELS_LOWERCASE)
        return random.choice(CONSONANTS_LOWERCASE)
    if x in LETTERS_UPPERCASE:
        if x in VOWELS_UPPERCASE:
            return random.choice(VOWELS_UPPERCASE)
        return random.choice(CONSONANTS_UPPERCASE)
    if x in string.digits:
        return random.choice(string.digits)
    return x


def _randomize_word(x):
    """Turn a single word into a random word of the "same" type.

    A word that parses under one of DATE_FORMATS is replaced by a random date
    between the epoch and now, rendered with the matching format. Note that
    short numeric tokens such as "1.5" also parse as a date ("%d.%m").
    Every other word is randomized character by character.

    @param x: a single word
    @return: a random word of the "same" type
    """
    for this_format in DATE_FORMATS:
        try:
            # Parse the word that was passed in; the result itself is not
            # needed, only whether parsing succeeds.
            datetime.datetime.strptime(x, this_format)
        except ValueError:
            continue
        try:
            d = random.randint(1, int(time.time()))
            return datetime.datetime.fromtimestamp(d).strftime(this_format)
        except (OverflowError, OSError):
            # fromtimestamp() may reject a timestamp on some platforms; keep
            # the best-effort behaviour and try the remaining formats.
            continue
    return "".join(_randomize_char(this_character) for this_character in x)


def shuffle(original_value):
    """Replace every word of a string by a random word of the same shape.

    The value is split on single spaces and each part is randomized on its
    own, so the number and position of spaces is preserved.

    @param original_value: string to be randomized, or None
    @return: the randomized string, or None if original_value is None
    """
    if original_value is None:
        return None
    return " ".join(_randomize_word(this_word) for this_word in original_value.split(" "))
def main(xml_file, xpaths_file, mode="shuffle"):
    """Anonymize the text of all elements selected by a list of XPaths.

    Parses xml_file, replaces the text of every element matched by the XPath
    expressions listed in xpaths_file and writes the document back to
    xml_file (pretty-printed, UTF-8, with XML declaration).

    @param xml_file: path of the XML file to transform; it is overwritten
    @param xpaths_file: filename containing newline separated xpaths
    @param mode: how to transform; only "shuffle" is supported
    @raise ValueError: if mode is not a supported transformation
    """
    # Reject an unknown mode before touching any file instead of failing
    # later with an unbound or stale replacement value.
    if mode != "shuffle":
        raise ValueError("unsupported mode: {!r}".format(mode))

    with codecs.open(xpaths_file, "r", "utf-8") as paths_handle:
        all_paths = paths_handle.readlines()

    xml_struct = etree.parse(xml_file)
    nsmap = xml_struct.getroot().nsmap.copy()
    # A default namespace shows up under the key None, which lxml's xpath()
    # does not accept as a prefix; drop it so prefixed lookups keep working.
    nsmap.pop(None, None)

    for this_path in all_paths:
        # strip() also removes the "\r" left behind by CRLF line endings.
        this_path = this_path.strip()
        if not this_path:
            continue
        try:
            to_anonymize = xml_struct.xpath(this_path, namespaces=nsmap)
        except etree.XPathError as error:
            # Skip the broken expression; never fall through with the result
            # of a previous path.
            logger.warning("skipping xpath %r: %s", this_path, error)
            continue
        if not isinstance(to_anonymize, list):
            # Expressions such as count(...) yield a scalar, not a node list.
            logger.warning("skipping xpath %r: result is not a node list", this_path)
            continue
        for this_element in to_anonymize:
            if not etree.iselement(this_element):
                # text() / attribute results are strings and have no .text.
                logger.warning("skipping non-element result of xpath %r", this_path)
                continue
            original_value = this_element.text
            new_value = shuffle(original_value)
            this_element.text = new_value
            # NOTE(review): this logs the original (non-anonymized) value.
            logger.info("%s - %s", original_value, new_value)

    # Write the transformed document back over the input file.
    with open(xml_file, "wb") as out_file:
        xml_struct.write(out_file, pretty_print=True, xml_declaration=True, encoding="utf-8")
if __name__ == '__main__':
    # Invocation: anonymize.py <xml_file> <xpaths_file> [mode]
    if len(sys.argv) < 3:
        sys.stderr.write("usage: {} <xml_file> <xpaths_file> [mode]\n".format(sys.argv[0]))
        sys.exit(2)
    # Forward the optional mode only when it was given, so that main()'s
    # default applies otherwise; extra arguments are ignored as before.
    main(*sys.argv[1:4])