-
Notifications
You must be signed in to change notification settings - Fork 3
/
inputDataChiTime_converter.py
69 lines (53 loc) · 2.77 KB
/
inputDataChiTime_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
import numpy as np
import os
# Convert the .csv data files we already have into the Matrix Market
# Exchange Format so that GraphChi can read our data when we generate
# our predictions.
# https://math.nist.gov/MatrixMarket/formats.html
# https://people.sc.fsu.edu/~jburkardt/data/mm/mm.html
# Prepend lines to the start of an existing file; approach taken from:
# https://www.quora.com/How-can-I-write-text-in-the-first-line-of-an-existing-file-using-Python
def prependEntries(fileName, string1, string2):
    """Insert two lines at the very top of an existing text file.

    Any trailing carriage returns / newlines are stripped from ``string1``
    and ``string2``; each is then written on its own line, followed by the
    file's previous contents. The whole file is read into memory to do this.
    """
    with open(fileName, 'r+') as handle:
        existing = handle.read()
        leadingLines = ''.join(
            text.rstrip('\r\n') + '\n' for text in (string1, string2)
        )
        # Rewind so the new lines land in front of the old contents.
        handle.seek(0)
        handle.write(leadingLines + existing)
# This function was written for the specific column layout of our .csv files.
def fileToMatrixMarket_MU(fileName, label, maxUsers, maxMovies, maxTime):
    """Convert one CSV from ``data/`` into a Matrix Market style file.

    The CSV is read from ``data/<fileName>``, the 'Unnamed: 0' and
    'Date Number' columns are dropped, and the remaining columns are cast
    to int32 and written space-separated (in their CSV order, no header or
    index) to ``graphchi-cpp/<label>_mm``. Two lines precede the data: the
    ``%%MatrixMarket`` banner and a line of
    ``maxUsers maxMovies maxBins numRatings``.

    Parameters:
        fileName:  name of the CSV inside the ``data`` directory.
        label:     prefix of the output file; ``_mm`` is appended.
        maxUsers:  user dimension for the size line, or -1 to take both the
                   user and movie maxima from this file.
        maxMovies: movie dimension for the size line (replaced when
                   ``maxUsers`` is -1).
        maxTime:   bin dimension for the size line, or -1 to take it from
                   this file. A supplied value is raised to this file's own
                   maximum bin if that is larger, so the size line never
                   understates the data.

    Returns:
        The tuple ``(maxUsers, maxMovies, maxBins)`` used in the size line.

    The ``graphchi-cpp`` directory must already exist; it is not created.
    """
    # MU data
    print('Loading data', label,'mu...')
    df = pd.read_csv(os.path.join('data', fileName))

    # Drop the columns we are not using before writing the file out.
    del df['Unnamed: 0']
    del df['Date Number']
    df = df.astype('int32')

    # The first file read (signalled by -1) supplies the user/movie counts.
    if maxUsers == -1:
        maxUsers = df['User Number'].max()
        maxMovies = df['Movie Number'].max()

    # Reuse the caller's bin count when one was given, so every file shares
    # the same time dimension, but never go below what this file contains.
    fileMaxBins = df['bin'].max()
    if maxTime == -1:
        maxBins = fileMaxBins
    else:
        maxBins = max(maxTime, fileMaxBins)

    numRatings = df.shape[0]
    print('maxUsers:', maxUsers, 'maxMovies:', maxMovies, 'maxBins', maxBins, 'numRatings', numRatings)

    newFileName = label + '_mm'
    newFileLocation = "graphchi-cpp/" + newFileName
    print('Making new file', newFileName)

    # First rows required ahead of the data for the Matrix Market Exchange Format.
    rowsColsEntries = str(maxUsers) + ' ' + str(maxMovies) + ' ' + str(maxBins) + ' ' + str(numRatings)
    print('num Users, num Movies, num Bins, num ratings', rowsColsEntries)
    fileHeader = '%%MatrixMarket matrix coordinate real general'

    # Write header and data in a single pass instead of writing the data and
    # then re-reading the whole file into memory just to prepend two lines.
    # newline='' leaves line endings alone; pandas emits os.linesep itself,
    # so the header lines use os.linesep too.
    with open(newFileLocation, 'w', newline='') as out:
        out.write(fileHeader + os.linesep + rowsColsEntries + os.linesep)
        df.to_csv(out, sep=' ', index=False, header=False)

    print('file processing done for', label, 'new file created', newFileLocation, '\n')
    return(maxUsers, maxMovies, maxBins)
# Driver: convert the training file first, passing -1 so the converter takes
# the dimensions from that file, then reuse those dimensions for the rest.
maxUsers = maxMovies = maxBins = -1
maxUsers, maxMovies, maxBins = fileToMatrixMarket_MU(
    'mu_train.csv', 'mu_trainTime', maxUsers, maxMovies, maxBins
)

# (input CSV, output label) pairs, converted in this order.
remainingFiles = [
    ('mu_val.csv', 'mu_valTime'),
    ('mu_probe.csv', 'mu_probeTime'),
    ('mu_qual.csv', 'mu_qualTime'),
    ('mu_qual_val.csv', 'mu_qual_valTime'),
    ('mu_qual_probe.csv', 'mu_qual_probeTime'),
]
for csvName, outLabel in remainingFiles:
    fileToMatrixMarket_MU(csvName, outLabel, maxUsers, maxMovies, maxBins)