-
Notifications
You must be signed in to change notification settings - Fork 2
/
TextDealer.py
245 lines (226 loc) · 7.11 KB
/
TextDealer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# -*- coding:utf-8 -*-
"""
@author:SamanthaChen
@file:TextDealer.py
@time:2017/3/219:42
"""
from collections import defaultdict
# ---- Remove duplicate lines from a file ----
def deleteDupLine(inputPath="Data/user_tag.dat", outputPath="Data/user_tag2.dat"):
    """Copy a text file, dropping every line that already appeared earlier.

    Lines are compared verbatim, including their line terminator, and the
    first occurrence of each line is kept in its original position.

    Args:
        inputPath: file to read; defaults to the path originally hard-coded.
        outputPath: file to (over)write with the de-duplicated lines.
    """
    with open(inputPath, "r") as rFile:
        allLine = rFile.readlines()

    # A set gives the same membership test the old dict/has_key pair did,
    # and `in` works on both Python 2 and Python 3.
    seen = set()
    with open(outputPath, "w") as wFile:
        for line in allLine:
            if line not in seen:
                seen.add(line)
                wFile.write(line)
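# NOTE: the original heading here read "map node ids to start from 1"; the function below does no renumbering, it only rewrites an adjacency list as an edge list.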
def convertStyle(inputPath="Data/lastfm_graph", outputPath="DataNew/lastfm_all.edges"):
    """Rewrite an adjacency-list file as one tab-separated pair per line.

    Each input line is whitespace-split into a node followed by its
    neighbours. For every neighbour a line "node<TAB>neighbour" is written.
    If a node heads several input lines, only the last such line is used
    (same as the original assignment-based logic). Blank lines are skipped.

    Args:
        inputPath: adjacency-list file to read.
        outputPath: edge file to (over)write.
    """
    adjacency = {}
    with open(inputPath, "r") as rFile:
        for line in rFile:
            words = line.split()
            if not words:
                # A blank line has no node token; nothing to record.
                continue
            adjacency[words[0]] = words[1:]

    with open(outputPath, "w") as wFile:
        for node, neighbours in adjacency.items():
            wFile.writelines(node + '\t' + nei + '\n' for nei in neighbours)
def convertStyle2(data='cora', inputPath=None, outputPath=None):
    """Rewrite a "<data>_nodelabel" file as one space-separated pair per line.

    Each input line is whitespace-split into a node followed by its values.
    For every value a line "node value" is written. If a node heads several
    input lines, only the last such line is used. Blank lines are skipped.

    Args:
        data: dataset name used to build the default paths.
        inputPath: file to read; when None, the original hard-coded
            location for ``data`` is used.
        outputPath: file to (over)write; when None, the original hard-coded
            location for ``data`` is used.
    """
    if inputPath is None:
        inputPath = 'L:/ACQData/groundTruthData/' + data + '/' + data + '_nodelabel'
    if outputPath is None:
        outputPath = "L:/ACQData/LDense/" + data + "_all.featlabels"

    labelsByNode = {}
    with open(inputPath, "r") as rFile:
        for line in rFile:
            words = line.split()
            if not words:
                # A blank line has no node token; nothing to record.
                continue
            labelsByNode[words[0]] = words[1:]

    with open(outputPath, "w") as wFile:
        for node, labels in labelsByNode.items():
            # Single space as separator (the tab variant is convertStyle).
            wFile.writelines(node + ' ' + label + '\n' for label in labels)
def info2onlyNode(inputPath="L:/ACQData/lastfm_Query_wall.txt",
                  outputPath="L:/ACQData/lastfm_Query_wall_onlyNode.txt"):
    """Reduce a query file to just its query-node fields.

    Each input line is split on tabs. The fields strictly between the
    'node:' marker and the 'attrs:' marker are written to the output as one
    tab-separated line. Blank lines are skipped.

    Args:
        inputPath: query file to read.
        outputPath: node-only file to (over)write.

    Raises:
        ValueError: if a non-blank line lacks the 'node:' or 'attrs:' field.
    """
    with open(inputPath, "r") as rFile, open(outputPath, "w") as wFile:
        for line in rFile:
            if not line.strip():
                continue
            # Drop the line terminator first so a marker that is the last
            # field on the line is still found by index().
            words = line.rstrip('\r\n').split('\t')
            nodeStart = words.index('node:')    # query nodes start after this
            attrsStart = words.index('attrs:')  # query attributes start here
            # join() emits no trailing tab; the original built the string
            # with a trailing tab and then discarded the result of strip().
            wFile.write('\t'.join(words[nodeStart + 1:attrsStart]) + '\n')
def info2onlyNode2(data='cora', inputPath=None, outputPath=None):
    """Reduce a "<data>_query_2Nei_w3_100" file to just its query-node fields.

    Each input line is split on tabs. The fields strictly between the
    'node:' marker and the 'attr:' marker are written to the output as one
    tab-separated line. Blank lines are skipped.

    Args:
        data: dataset name used to build the default paths.
        inputPath: file to read; when None, the original hard-coded
            location for ``data`` is used.
        outputPath: file to (over)write; when None, the original hard-coded
            location for ``data`` is used.

    Raises:
        ValueError: if a non-blank line lacks the 'node:' or 'attr:' field.
    """
    if inputPath is None:
        inputPath = 'L:/ACQData/groundTruthData/' + data + '/' + data + '_query_2Nei_w3_100'
    if outputPath is None:
        outputPath = 'L:/ACQData/cocktail/' + data + '/' + data + '_query_2Nei_w3_100_onlyNode'

    with open(inputPath, "r") as rFile, open(outputPath, "w") as wFile:
        for line in rFile:
            if not line.strip():
                continue
            # Drop the line terminator first so a marker that is the last
            # field on the line is still found by index().
            words = line.rstrip('\r\n').split('\t')
            nodeStart = words.index('node:')   # query nodes start after this
            attrStart = words.index('attr:')   # query attributes start here
            # join() emits no trailing tab; the original built the string
            # with a trailing tab and then discarded the result of strip().
            wFile.write('\t'.join(words[nodeStart + 1:attrStart]) + '\n')
def adj2edge(inputPath=r"L:\ACQData\cocktail\lastfm_graph",
             outputPath=r"L:\ACQData\cocktail\lastfm_edgeList"):
    """Convert an adjacency-list file into an edge-list file.

    Each input line is whitespace-split into a node followed by its
    neighbours. For every neighbour a line "node<TAB>neighbour" is written,
    in input order. Lines with no neighbours produce no output.

    Args:
        inputPath: adjacency-list file to read.
        outputPath: edge-list file to (over)write.
    """
    # The default paths are raw strings: same value as the original literals,
    # without relying on unrecognised backslash escapes being left intact.
    with open(inputPath, "r") as rFile, open(outputPath, "w") as wFile:
        for line in rFile:
            words = line.split()
            wFile.writelines(words[0] + '\t' + adj + '\n' for adj in words[1:])
def _writeIdRows(filePath, rows):
    """Write each (key, values) entry of *rows* as one 'key v1 v2 ...' line."""
    with open(filePath, 'w') as out:
        for key, values in rows.items():
            out.write(' '.join([str(key)] + [str(v) for v in values]) + '\n')


def citseer(dataName='WebKB/wisconsin', path='L:/ACQData/groundTruthData/'):
    """Convert a citeseer-style dataset (.cites + .content) into id-based files.

    Reads ``path + dataName + '.cites'`` (two node names per line) and
    ``path + dataName + '.content'`` (node name, 0/1 attribute flags, class
    label) and writes four files next to them:

    * ``_graph``     - "id nei nei ..." adjacency lists (undirected, self
      citations ignored, duplicate citations kept),
    * ``_nodelabel`` - "id i j ..." positions of the attribute flags equal
      to 1 (nodes with no flag set get no line),
    * ``_class``     - "id class",
    * ``_idMap``     - "name id".

    Integer ids are assigned from 0 in order of first appearance. A node
    that occurs only in the .content file also receives a fresh id; the
    original code raised KeyError in that case. Blank lines are skipped.

    Args:
        dataName: dataset name appended to ``path`` to form the file prefix.
        path: directory prefix; it is concatenated as-is, so it must end
            with a path separator.

    Raises:
        ValueError: if an attribute flag in the .content file is not an
            integer.
    """
    prefix = path + dataName
    idmap = {}
    adjDict = defaultdict(list)
    labelDict = defaultdict(list)
    classDict = {}  # final class label of every node

    # 1. Build the adjacency lists from the citation pairs.
    with open(prefix + '.cites', 'r') as citef:
        for line in citef:
            words = line.split()
            if len(words) < 2:
                continue
            # len(idmap) is the next unused id, so setdefault numbers names
            # in order of first appearance.
            u = idmap.setdefault(words[0], len(idmap))
            v = idmap.setdefault(words[1], len(idmap))
            if u == v:
                # Ignore papers citing themselves.
                continue
            adjDict[u].append(v)
            adjDict[v].append(u)

    # 1.1 Write the graph file.
    _writeIdRows(prefix + '_graph', adjDict)

    # 2. Read node attributes and class labels.
    with open(prefix + '.content', 'r') as contentf:
        for line in contentf:
            words = line.split()
            if not words:
                continue
            nodeId = idmap.setdefault(words[0], len(idmap))
            for i, val in enumerate(words[1:-1]):
                if int(val) == 1:
                    labelDict[nodeId].append(i)
            classDict[nodeId] = words[-1]

    # 3. Write the attribute file.
    _writeIdRows(prefix + '_nodelabel', labelDict)

    # 4. Write the class file.
    with open(prefix + '_class', 'w') as classw:
        for nodeId, classid in classDict.items():
            classw.write(str(nodeId) + ' ' + classid + '\n')

    # 5. Write the name -> id map file.
    with open(prefix + '_idMap', 'w') as mapw:
        for name, nodeId in idmap.items():
            mapw.write(name + ' ' + str(nodeId) + '\n')
def deletNei(inputPath=r'E:\ACQ\Datasets\citeseer_graph',
             outputPath=r'E:\ACQ\Datasets\citeseer_graph_true',
             idLimit=3279):
    """Copy a graph file, dropping every node id that is >= ``idLimit``.

    Each input line is whitespace-split; tokens whose integer value is below
    ``idLimit`` are kept and written back space-separated. One output line
    is written per input line, so a line whose tokens are all dropped
    becomes an empty line.

    NOTE(review): the original comment mentioned 3327 while the code used
    3279; the code's value is kept as the default - confirm which is meant.
    NOTE(review): the leading token (the line's own node) is filtered like
    any neighbour, so a line whose first id is dropped then starts with one
    of its neighbours - confirm this is intended.

    Args:
        inputPath: graph file to read.
        outputPath: filtered graph file to (over)write.
        idLimit: exclusive upper bound for ids that are kept.

    Raises:
        ValueError: if a token is not an integer.
    """
    # The default paths are raw strings: same value as the original literals,
    # without relying on unrecognised backslash escapes being left intact.
    with open(inputPath, 'r') as rFile:
        allLine = rFile.readlines()

    with open(outputPath, 'w') as wFile:
        for line in allLine:
            kept = [tok for tok in line.split() if int(tok) < idLimit]
            wFile.write(' '.join(kept) + '\n')
def add0dNode(path='L:/ACQData/groundTruthData/', dataName='wisconsin'):
    """Append the missing node ids to an adjacency-list graph file.

    Reads ``path + dataName + '/' + dataName + '_graph'``, collects the
    first token of every line as a node id, and writes
    ``..._graph_all``: all original lines followed by one line per id in
    ``0..max id`` that heads no input line (each such line holds only the
    id, i.e. a node without neighbours).

    Blank lines are copied through but ignored when collecting ids. If the
    input does not end with a newline, one is added so the first appended
    id is not glued onto the last original line.

    Args:
        path: directory prefix; concatenated as-is, so it must end with a
            path separator.
        dataName: dataset name, used both as sub-directory and file prefix.

    Raises:
        ValueError: if the first token of a line is not an integer.
    """
    prefix = path + dataName + '/' + dataName
    with open(prefix + '_graph', 'r') as rFile:
        allLine = rFile.readlines()

    # Ids of nodes that already have a line (degree > 0 in the source data).
    oldSet = set()
    for line in allLine:
        words = line.split()
        if words:
            oldSet.add(int(words[0]))

    # Same starting point as the original scan: the maximum is never below 0.
    maxid = max(oldSet | {0})

    # Ids in 0..maxid that head no line.
    zeroNode = [i for i in range(maxid + 1) if i not in oldSet]

    with open(prefix + '_graph_all', 'w') as wFile:
        for line in allLine:
            wFile.write(line if line.endswith('\n') else line + '\n')
        wFile.writelines(str(n) + '\n' for n in zeroNode)
if __name__ == '__main__':
    # Script entry point: run exactly one conversion step per invocation.
    # Other steps that have been toggled here by hand:
    #   add0dNode()
    #   convertStyle2()
    info2onlyNode2()