-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmletfns.py
73 lines (72 loc) · 2.46 KB
/
xmletfns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
# coding=UTF-8
from xml.etree import ElementTree as ET
import logsetup
# log=logsetup.getlog(__name__)
import file
import urllib
def readxmltext(text):
    """Parse XML held in the string *text* and return the parsed result.

    The value comes straight from ElementTree's ``fromstring``.
    """
    parsed = ET.fromstring(text)
    return parsed
def readxml(filename):
    """Parse the XML file *filename*.

    Returns a 2-tuple: the parsed ElementTree and its root element.
    """
    document = ET.parse(filename)
    return document, document.getroot()
def iselement(n):
    """Return True if *n* is an ElementTree Element, False otherwise."""
    element_type = ET.Element
    return isinstance(n, element_type)
def prettyprint(node):
    """Log an indented outline of *node* and return it as a single string.

    Debugging aid: lets you inspect Element contents before writing them,
    for ElementTree errors that are otherwise hard to understand.

    One line is produced per element: one tab per nesting level, then the
    tag, the attribute dict, and the text followed by its type.  The text
    part is left empty when the text is None or contains nothing but
    newlines, tabs and spaces.  Every line is also sent to ``log.info``.

    If *node* is not an Element, that fact is logged and None is returned.
    """
    # The logger is fetched per call: this module's functions are imported
    # with *, so no global log is used here.
    log = logsetup.getlog(__name__)
    if not isinstance(node, ET.Element):
        log.info("didn't prettyprint {}".format(node))
        return
    blank_chars = {'\n', '\t', ' '}
    lines = []

    def describe(element, depth):
        # Build, record and log the line for this element, then descend.
        text = element.text
        if text is None or blank_chars.issuperset(text):
            shown = ""
        else:
            shown = str(text) + ' (' + str(type(text)) + ')'
        entry = "{}{} {}: {}".format('\t' * depth, element.tag,
                                     element.attrib, shown)
        lines.append(entry)
        log.info(entry)
        for child in element:
            describe(child, depth + 1)

    describe(node, 0)
    return '\n'.join(lines)
def getxmlns(nodes):
    """Collect the value of the ``xmlns:xi`` attribute of *nodes*.

    Returns a set of the non-empty values found; the set is empty when the
    attribute is missing or empty.

    NOTE(review): ElementTree's parser normally consumes namespace
    declarations instead of exposing them as attributes, so this may only
    match an attribute that was set explicitly -- confirm against callers.
    """
    xmlns = set()
    # findall(".") selects the context element itself.
    for element in nodes.findall("."):
        value = element.get("xmlns:xi")
        if value:
            # set.add, not "+=": in-place add of a str to a set is a TypeError.
            xmlns.add(value)
    return xmlns
def iterateforincludes(node,ns,results=None):
    """Recursively collect the ``xi:include`` elements found under *node*.

    node    -- Element whose direct children are searched first; children
               that themselves have children are then searched recursively.
    ns      -- prefix-to-URI mapping handed to ``findall`` to resolve "xi".
    results -- optional list to extend in place; a fresh list is created
               when omitted, so separate calls never share matches.

    Returns the list of matching elements (the *results* object when one
    was supplied).
    """
    if results is None:
        # A literal [] default would be shared by every call without an
        # explicit accumulator, leaking matches from one call to the next.
        results = []
    results += node.findall("xi:include", ns)
    for child in node:
        # Explicit len() rather than truth-testing the Element, which
        # ElementTree deprecates; only elements with children can hold more.
        if len(child):
            iterateforincludes(child, ns, results)
    return results
def getincluded(filename,iterated=False):
    """Log every file reachable from *filename* through ``xi:include``.

    Each newly visited file starts here: the name is URL-unquoted, the file
    is parsed, its ``xi:include`` elements are collected, each ``href`` is
    resolved against the file's directory with ``file.getdiredrelURL``, the
    resolved reference is logged, and the function recurses into it.

    filename -- path or URL-quoted path of the XML file to inspect.
    iterated -- accepted for callers that pass it; not used in this function.

    Returns None.

    NOTE(review): there is no guard against include cycles; files that
    include each other would recurse until Python's recursion limit.
    """
    # Imported here because a bare "import urllib" does not guarantee that
    # the urllib.parse submodule is loaded.
    import urllib.parse

    # The logger is fetched per call: this module's functions are imported
    # with *, so no global log is used here.
    log = logsetup.getlog(__name__)
    filename = urllib.parse.unquote(str(filename), encoding='utf-8',
                                    errors='replace')
    _tree, root = readxml(filename)
    basedir = str(file.getfilenamedir(filename))
    ns = {'xi': "http://www.w3.org/2001/XInclude"}
    for include in iterateforincludes(root, ns, []):
        href = include.get('href')
        if href is None:
            # No href means there is no other file to resolve or follow.
            continue
        target = file.getdiredrelURL(basedir, href)
        log.info("Found reference to filename {}".format(target))
        getincluded(target)
if __name__ == '__main__':
    # Manual run: walk the includes of one hard-coded document.
    getincluded(
        '/home/kentr/Assignment/Production/CACBLD/Emic First Phonology_paper.xml'
    )