-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge.py
242 lines (186 loc) · 7.26 KB
/
merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python3
# coding=utf-8
""" Use this Python script to generate one pdf out of all the notes.
The first command line argument is the base name (without extension) of the generated files; when it is omitted a default name is used.
"""
from __future__ import annotations
import json
import os
from collections.abc import Generator
import glob
from itertools import chain
import shutil
import re
def find_filenames(extensions: list[str], excluded: list[str] | None = None,
                   exclude_regex: str | re.Pattern = r'(?!)') -> Generator[str, None, None]:
    """
    Lazily yield the relative paths of all files with one of the given extensions.

    The current working directory and its sub-directories up to two levels deep
    are searched (``*``, ``*/*`` and ``*/*/*``); files nested deeper are not found.
    Results are produced per extension, in the order the extensions are given.

    :param extensions: File extensions to find. Should NOT include the dot, example: ['png', 'webp']
    :param excluded: Paths to skip. Each entry is compared against the full relative
        path exactly as glob produces it, so a bare file name only excludes a file
        in the working directory itself.
    :param exclude_regex: Skip every path in which this regex is found (``re.search``
        semantics). The default ``(?!)`` matches nothing.
    :return: A generator of the matching relative paths.
    """
    # Build the lookup set once instead of scanning a list for every file found.
    skipped = frozenset(excluded) if excluded else frozenset()

    # Without recursive=True a `**` behaves like a single `*`, so the three
    # patterns cover depth 0, 1 and 2 respectively.
    found = chain.from_iterable(
        chain(glob.iglob(f'*.{extension}'),
              glob.iglob(f'*/**.{extension}'),
              glob.iglob(f'*/*/**.{extension}'))
        for extension in extensions
    )

    # Nothing to filter on: hand the glob results through untouched.
    if not skipped and exclude_regex == '(?!)':
        yield from found
        return

    exclude_pattern = exclude_regex if isinstance(exclude_regex, re.Pattern) else re.compile(exclude_regex)
    for path in found:
        if path not in skipped and not exclude_pattern.search(path):
            yield path
# The introduction is written into the export separately, so it is kept out of
# the generic markdown pass as well.
introduction_file = 'README.md'

# Markdown files that are left out of the export. The names are kept in sorted
# order, with the introduction file added at the end.
default_excluded_markdown = sorted([
    'Classification.md',
    'Data.md',
    'Languages.md',
    'Prediction.md',
    'index.md',
    'Semantic-Similarity.md',
])
default_excluded_markdown.append(introduction_file)

# Image files that are left out of the export (none by default).
default_excluded_images = []
# basename with extension
# basename = os.path.basename(path)
# Text written at the very top of the merged markdown file: a `---` delimited
# YAML metadata block, followed by a title line and a LaTeX page break.
# It is a raw string so the backslashes of the LaTeX commands (\today,
# \usepackage, \pagebreak, ...) are kept verbatim.
# Several keys (titlepage*, logo*, listings-*, ...) are presumably variables of
# the eisvogel template used for the styled pdf — verify against the template.
# NOTE(review): `inks-as-notes` looks like a typo of `links-as-notes`, which is
# set again further down with the opposite value — confirm which is intended.
# NOTE(review): `header-includes: |` is a YAML block scalar nested under
# `variables:`; both depend on indentation — confirm the nesting is correct.
output_header = r"""---
date: \today
fontsize: "11pt"
linestretch: 1.5
inks-as-notes: true
titlepage: true
titlepage-color: "C89343"
titlepage-text-color: "003267"
titlepage-rule-height: 4
logo: "logo.png"
logo-width: 100
page-background-opacity: 5
links-as-notes: false
lot: true
lof: true
listings-disable-line-numbers: true
listings-no-page-break: false
disable-header-and-footer: false
subparagraph: true
lang: "en-UK"
subtitle: "Midterm & Final"
author: "Quinten Cabo - [email protected]"
title: "Computational Linguistics Summary"
link-citations: true
variables:
header-includes: |
\usepackage[Latin]{ucharclasses}
\newfontfamily\fallbackfont{Gentium Plus} % Or whatever font you prefer.
\setTransitionsFor{LatinExtendedAdditional}{\fallbackfont}{\normalfont}
---
Computational Linguistics
\pagebreak
"""
if __name__ == '__main__':
    # Export pipeline:
    #   0. Create (or empty) the temporary export directory.
    #   1. Copy all images into it, replacing spaces in their file names.
    #   2. Create the merged markdown file there: header + introduction file.
    #   3. Append every other markdown file, one by one, after
    #      3a. pointing its image links at the export directory, and
    #      3b. captioning pasted images with the heading of the file.
    #   4. Try to generate a .tex file from the merged markdown with pandoc.
    #   5. Try to generate a plain .pdf with pandoc (xelatex).
    #   6. Try to generate a styled .pdf with pandoc and the eisvogel template.
    # All results stay inside the export directory: nothing is copied out of it
    # and the directory is not deleted afterwards.
    import subprocess
    import sys

    def run_pandoc(arguments: list[str]) -> int:
        """Run pandoc with *arguments* and return its exit code (127 when pandoc is missing).

        The arguments are passed as a list, without a shell, so file names that
        contain spaces or shell metacharacters are handed to pandoc unchanged.
        """
        try:
            return subprocess.run(['pandoc', *arguments], check=False).returncode
        except FileNotFoundError:
            print('Could not find the pandoc executable, is it installed and on the PATH?')
            return 127

    output = sys.argv[1] if len(sys.argv) > 1 else 'CL-Summary-Merged-Quinten'
    output_md = output if output.endswith('.md') else output + '.md'

    # Never merge a previously generated export back into itself.
    default_excluded_markdown.append(output_md)
    tempdir_name = 'EXPORT'
    default_excluded_markdown.append(str(os.path.join(tempdir_name, output_md)))

    wcd = os.getcwd()
    tmp = os.path.join(wcd, tempdir_name)
    output_in_tmp = os.path.join(tmp, output_md)

    # Step 0: make sure the export directory exists and is empty.
    print(f'\033[92mCreating temporary dir:\033[0m {tempdir_name}')
    try:
        os.mkdir(tmp)
    except FileExistsError:
        if os.listdir(tmp):
            # Delete and recreate it, so we are sure it is empty.
            shutil.rmtree(tmp)
            os.mkdir(tmp)

    # Step 1: copy the images, skipping anything already inside the export dir.
    print('\033[92mMoving images:\033[0m')
    for image_file in find_filenames(['webp', 'png'], default_excluded_images, fr'^{tempdir_name}'):
        # Image names can not have spaces; the links are rewritten to match below.
        basename = os.path.basename(image_file).replace(' ', '_')
        destination = os.path.join(tmp, basename)
        shutil.copyfile(image_file, destination)
        print(f'Moved {basename}')

    # Step 2: start the merged file with the header and the introduction.
    # UTF-8 is used explicitly because the notes are read and appended as UTF-8.
    print(f"\033[92mCreating export markdown file\033[0m: {output_in_tmp}")
    with open(output_in_tmp, 'w', encoding='utf-8') as output_file:
        output_file.write(output_header)
        with open(introduction_file, 'r', encoding='utf-8') as introduction:
            output_file.write(introduction.read())
        output_file.write('\n\n')

    # Step 3: append all remaining markdown files.
    print("\033[92mProcessing markdown files:\033[0m")
    # NOTE: the `A-z` range also covers the characters between 'Z' and 'a',
    # including '_'. The loop below relies on that: once the first %20 of a link
    # became '_', the first group has to match across it to reach the next %20.
    image_with_spaces_pattern = re.compile(r'(!\[.+]\([.\\/A-z0-9]+)(%20)([A-z 0-9.%]+\))')
    # The alt text may not contain ']' so that two images on one line are
    # matched separately; group 1 is the '(./Pasted_image_<digits>.png)' part.
    figure_finder = re.compile(r'!\[[^\]\n]*](\(\./Pasted_image_[0-9]+\.png\))')
    # Only matches a level-one heading at the very start of the file.
    header_finder = re.compile(r'^# {0,}?[^#](.+)')

    with open(output_in_tmp, 'a', encoding='utf-8') as outputfile:
        # Go through all the markdown files.
        for filename in find_filenames(['md'], default_excluded_markdown):
            with open(filename, 'r', encoding='utf-8') as file:
                text = file.read()

            # Each pass replaces one %20 per image link, repeat until none are left.
            while image_with_spaces_pattern.search(text):
                text = image_with_spaces_pattern.sub(r'\1_\3', text)

            # Step 3a: all images live next to the merged file in the export dir.
            text = text.replace(r'.../images', r'../images')
            text = text.replace(r'../images/', r'')
            text = text.replace(r'.././images', r'')
            text = text.replace(r'images/', r'')

            # Use the heading text without its '#' marker; the fallback is used as is.
            title_match = header_finder.search(text)
            header = title_match.group().lstrip('#').strip() if title_match else 'No Title'

            # Step 3b: give every pasted image its own numbered caption. Each match
            # is rewritten individually so the numbering advances per image and no
            # other text in the file is touched.
            pieces = []
            position = 0
            for img_index, figure in enumerate(figure_finder.finditer(text), start=1):
                pieces.append(text[position:figure.start()])
                pieces.append(f'![{header} {img_index}]{figure.group(1)}')
                position = figure.end()
            pieces.append(text[position:])
            text = ''.join(pieces)

            outputfile.write(text)
            outputfile.write("\n\n")  # Add new lines between every file
            print('Processed:', filename)

    # We now have to go to the export dir to build, the image links are relative to it.
    os.chdir(tmp)
    print(output_in_tmp)

    # Step 4
    print('\033[92mAttempting to run pandoc to generate .tex file\033[0m')
    tex_output = os.path.join(tmp, output + '.tex')
    run_pandoc(['-s', output_in_tmp, '-o', tex_output])

    # Step 5
    print('\033[92mAttempting to run pandoc to generate .pdf file\033[0m')
    pdf_output = os.path.join(tmp, output + '.pdf')
    r = run_pandoc(['-s', output_in_tmp, '--pdf-engine=xelatex', '-o', pdf_output])
    print('Exit code of pandoc pdf', r)
    if r == 0:
        print(f'It seems like pandoc had success. You have a pdf now at {pdf_output}')

    # Step 6
    print('\033[92mAttempting to run pandoc to generate fancy (eisvogel) .pdf file\033[0m')
    pdf_output_eisvogel = os.path.join(tmp, output + '-eisvogel' + '.pdf')
    r = run_pandoc([
        output_md,
        '-o', pdf_output_eisvogel,
        '--columns=50',
        '--citeproc',
        '--number-sections',
        '--pdf-engine', 'xelatex',
        '--toc',
        '--dpi=300',
        '--template=eisvogel',
        '--strip-comments',
    ])
    print('Exit code of pandoc with eisvogel is ', r)
    if r == 0:
        print(f'It seems like pandoc had success. You have a pdf now in {pdf_output_eisvogel}')
    # If it doesn't find images you probably did not put anything in ![here]()