-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdddg.py
145 lines (107 loc) · 4.27 KB
/
dddg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Holger Bast"
__email__ = "holgerbast at gmx de"
__copyright__ = "Copyright 2018"
__license__ = "MIT"
__version__ = "0"
__status__ = "Development"
import hashlib
import os
import random

import numpy
from faker import Faker
from lxml import etree
from lxml.builder import E
from PIL import Image
from progressbar import progressbar
# Export folder for the generated images and the XML file. File names are
# appended by plain string concatenation, hence the trailing slash.
out = './example/'
# gen_section() stops recursing once its depth argument reaches this value.
STRUCTURE_DEPTH = 5
# Number of <chapter> elements the main loop creates (one per iteration).
MAXCNT_CHAPTER = 5
# Inclusive upper bound for the random number of sections added per parent.
MAXCNT_SECTION = 5
# Inclusive upper bound for the random number of paragraphs added per section.
MAXCNT_PARAGRAPH = 3
# Number of randomly chosen paragraphs that get a generated figure.
MAXCNT_IMAGE = 6
# Number of <xref> links to images that are scattered over the paragraphs.
MAXCNT_IMAGELINK = 1000
# Shared Faker instance (German locale) used for all headings and body text.
fake = Faker(locale="de_DE")
def gen_heading_name():
    """Return a random heading text.

    The text is a Faker sentence (requested with ``nb_words=5``) from the
    module-level ``fake`` instance, with every '.' removed.
    """
    sentence = fake.sentence(nb_words=5)
    return sentence.replace('.', '')
def gen_paragraph_text():
    """Return a random block of body text from the module-level ``fake``."""
    body = fake.text()
    return body
def gen_paragraph(parent):
    """Append a random number of <para> elements to *parent*.

    Between 1 and MAXCNT_PARAGRAPH (inclusive) paragraphs are created, each
    filled with text from gen_paragraph_text(). Returns None.
    """
    remaining = random.randint(1, MAXCNT_PARAGRAPH)
    while remaining > 0:
        parent.append(E.para(gen_paragraph_text()))
        remaining -= 1
def gen_section(parent, depth):
    """Recursively append nested <section> elements to *parent*.

    At every level between 1 and MAXCNT_SECTION (inclusive) sections are
    created. Each one receives a <title>, a random set of paragraphs and,
    through recursion with ``depth + 1``, its own subsections. Nothing is
    added once *depth* has reached STRUCTURE_DEPTH. Returns None.
    """
    if depth < STRUCTURE_DEPTH:
        section_count = random.randint(1, MAXCNT_SECTION)
        for _ in range(section_count):
            heading = E.title(gen_heading_name())
            section = E.section(heading)
            parent.append(section)
            gen_paragraph(section)
            gen_section(section, depth + 1)
# DocBook namespaces: the bare URIs first, then the "{uri}" prefixes that are
# prepended to tag names, and finally the prefix map for the root element.
ns0_NAMESPACE = "http://docbook.org/ns/docbook"    # default namespace
ns1_NAMESPACE = "http://www.w3.org/1999/xlink"     # xlink
ns2_NAMESPACE = "http://www.w3.org/2001/XInclude"  # xi
ns0, ns1, ns2 = (
    "{" + uri + "}"
    for uri in (ns0_NAMESPACE, ns1_NAMESPACE, ns2_NAMESPACE)
)
NSMAP = {
    None: ns0_NAMESPACE,
    'xlink': ns1_NAMESPACE,
    'xi': ns2_NAMESPACE,
}
# Announce the tool on stdout.
print('DocBook Dummy Document Generator')
print('--------------------------------')

# Root <book> element in the DocBook namespace, carrying the prefix map.
document = etree.Element(ns0 + "book", nsmap=NSMAP)

# Document structure: MAXCNT_CHAPTER titled chapters, each filled with a
# randomly nested section tree starting at depth 0.
for _ in progressbar(range(MAXCNT_CHAPTER), prefix='Document Structure: '):
    heading = E.title(gen_heading_name())
    chapter = E.chapter(heading)
    document.append(chapter)
    gen_section(chapter, 0)
# Adding Images
# Create the export folder up front so that saving the first image (and later
# the XML file) does not fail on a fresh checkout.
os.makedirs(out, exist_ok=True)

paragraphs = document.xpath('//para')
images = []  # ids of all generated figures, used later as link targets

# random.sample() raises ValueError when asked for more items than the
# population holds, so never request more images than there are paragraphs.
image_count = min(MAXCNT_IMAGE, len(paragraphs))

# Sample over every paragraph index; range(len(...)) already excludes the
# upper bound, so no "- 1" is needed to stay in range.
for para in progressbar(random.sample(range(len(paragraphs)), image_count), prefix='Images: '):
    # The id is the MD5 of the paragraph index, so the same index always maps
    # to the same id and file name.
    image_id = hashlib.md5(str(para).encode('utf-8')).hexdigest()

    # Random noise picture built from a 200 x 300 x 3 array of values in [0, 255).
    rgb_array = numpy.random.rand(200, 300, 3) * 255
    image = Image.fromarray(rgb_array.astype('uint8')).convert('RGB')
    filename = image_id + '.jpg'
    image.save(out + filename)

    figure = E.figure(
        E.title(gen_heading_name()),
        E.mediaobject(
            E.alt(gen_heading_name()),
            E.imageobject(
                E.imagedata(fileref=filename)
            )
        ),
        id=image_id)
    # Place the figure as the next sibling of the chosen paragraph.
    paragraphs[para].addnext(figure)
    images.append(image_id)
# Adding Links to Images
# Each iteration picks a random paragraph, a random word inside that
# paragraph's leading text and a random image, then puts an <xref> pointing
# at the image right behind the chosen word.
# Without paragraphs or images there is nothing to link; randint() on an empty
# range would raise ValueError, so the whole step is skipped in that case.
if paragraphs and images:
    for para in progressbar([random.randint(0, len(paragraphs) - 1) for _ in range(MAXCNT_IMAGELINK)], prefix="Linking Images: "):
        paragraph = paragraphs[para]
        text = paragraph.text or ''
        words = text.split()
        if not words:  # no word left in front of the first child element
            continue

        pos = random.randint(0, len(words) - 1)  # index of the word that gets the link

        # Walk word by word up to the chosen one. A plain text.find(word)
        # would stop at the first occurrence, which may be an earlier word or
        # even the inside of one.
        end_index = 0
        for word in words[:pos + 1]:
            end_index = text.find(word, end_index) + len(word)

        xref = etree.Element("xref", linkend=random.choice(images))
        xref.tail = text[end_index:]              # text following the link
        paragraph.text = text[:end_index] + ' '   # text in front of the link
        # paragraph.text precedes every child element, so the new link has to
        # become the FIRST child. Appending it at the end would reorder the
        # text of paragraphs that already contain a link.
        paragraph.insert(0, xref)
# Save output: wrap the root element in an ElementTree and write it as
# pretty-printed UTF-8 XML (with XML declaration) into the export folder.
output_path = out + 'output.xml'
tree = etree.ElementTree(document)
tree.write(
    output_path,
    pretty_print=True,
    xml_declaration=True,
    encoding="utf-8",
)