inbreast.py

#import dicom # some machines not install pydicom
import scipy.misc
import numpy as np 
from sklearn.model_selection import StratifiedKFold
import cPickle
#import matplotlib
#import matplotlib.pyplot as plt 
from skimage.filters import threshold_otsu
import os
from os.path import join as join
import csv
import scipy.ndimage
import dicom
#import cv2
path = '../AllDICOMs/'
preprocesspath = '../preprocesspath/'
labelfile = './label.txt'

def readlabel():
  '''read the label as a dict from labelfile'''
  mydict = {}
  with open(labelfile, 'r') as f:
    flines = f.readlines()
    for line in flines:
      data = line.split()
      if int(data[1]) == 0:
        mydict[data[0]] = int(data[1])
      else:
        assert(int(data[1])==2 or int(data[1])==1)
        mydict[data[0]] = int(data[1])-1
  return mydict

def readdicom(mydict):
  '''read the dicom image, rename it consistently with the name in labels, crop and resize, and save as pickle.
  mydict is the returned value of readlabel'''
  img_ext = '.dcm'
  img_fnames = [x for x in os.listdir(path) if x.endswith(img_ext)]
  for f in img_fnames:
    names = f.split('_')
    if names[0] not in mydict:
      print(names[0]+'occur error')
    dicom_content = dicom.read_file(join(path,f))
    img = dicom_content.pixel_array
    '''fig = plt.figure()
    ax1 = plt.subplot(3,3,1)
    ax2 = plt.subplot(3,3,2)
    ax3 = plt.subplot(3,3,3)
    ax4 = plt.subplot(3,3,4)
    ax5 = plt.subplot(3,3,5)
    ax6 = plt.subplot(3,3,6)
    ax7 = plt.subplot(3,3,7)
    ax8 = plt.subplot(3,3,8)
    ax9 = plt.subplot(3,3,9)
    ax1.imshow(img, cmap='Greys_r')
    ax1.set_title('Original')
    ax1.axis('off')'''
    
    thresh = threshold_otsu(img)
    binary = img > thresh
    #ax2.imshow(binary, cmap='Greys_r')
    #ax2.set_title('mask')
    #ax2.axis('off')
    
    minx, miny = 0, 0
    maxx, maxy = img.shape[0], img.shape[1]
    for xx in xrange(img.shape[1]):
      if sum(binary[xx, :]==0) < binary.shape[1]-60:
        minx = xx
        break
    for xx in xrange(img.shape[0]-1,0,-1):
      if sum(binary[xx, :]==0) < binary.shape[1]-60:
        maxx = xx
        break
    if names[3] == 'R':
      maxy = img.shape[1]
      for yy in xrange(int(img.shape[1]*3.0/4), -1, -1):
        if sum(binary[:,yy]==0) > binary.shape[0]-10: 
          miny = yy
          break
    else:
      miny = 0
      for yy in xrange(int(img.shape[1]/4.0), img.shape[1], 1):
        if sum(binary[:,yy]==0) > binary.shape[0]-10: 
          maxy = yy
          break
    print(minx, maxx, miny, maxy)
    #ax3.set_title('Foreground')
    #ax3.imshow(img[minx:maxx+1, miny:maxy+1], cmap='Greys_r')
    #ax3.axis('off')
    
    img = img.astype(np.float32)
    img1 = scipy.misc.imresize(img[minx:maxx+1, miny:maxy+1], (227, 227), interp='cubic')
    with open(join(preprocesspath, names[0])+'227.pickle', 'wb') as outfile:
      cPickle.dump(img1, outfile) 
    img1 = scipy.misc.imresize(img[minx:maxx+1, miny:maxy+1], (299, 299), interp='cubic')
    with open(join(preprocesspath, names[0])+'299.pickle', 'wb') as outfile:
      cPickle.dump(img1, outfile) 
    '''ax4.set_title('Resize')
    ax4.imshow(img, cmap='Greys_r')
    ax4.axis('off')

    img = img.astype(np.float32)
    img -= np.mean(img)
    img /= np.std(img)
    ax5.set_title('Norm')
    ax5.imshow(img, cmap='Greys_r')
    ax5.axis('off')
    with open(join(preprocesspath, names[0])+'norm.pickle', 'wb') as outfile:
      cPickle.dump(img, outfile)
      #imgshape = img.shape
    
    img = np.fliplr(img)
    ax6.set_title('Flip')
    ax6.imshow(img, cmap='Greys_r')
    ax6.axis('off')
    
    num_rot = np.random.choice(4)               #rotate 90 randomly
    img = np.rot90(img, num_rot)
    ax7.set_title('Rotation')
    ax7.imshow(img, cmap='Greys_r')
    ax7.axis('off')
    fig.savefig(join(preprocesspath, names[0])+'.jpg')
    plt.close(fig)'''

def cvsplit(fold, totalfold, mydict):
  '''get the split of train and test
  fold is the returned fold th data, from 0 to totalfold-1
  total fold is for the cross validation
  mydict is the return dict from readlabel'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
  #readdicom(mydict)
  y = mydict.values()
  x = mydict.keys()
  count = 0
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test
      return train, test
    count += 1

def cvsplitenhance(fold, totalfold, mydict, valfold=-1):
  '''get the split of train and test
  fold is the returned fold th data, from 0 to totalfold-1
  total fold is for the cross validation
  mydict is the return dict from readlabel
  sperate the data into train, validation, test'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
  #readdicom(mydict)
  y = mydict.values()
  x = mydict.keys()
  count = 0
  if valfold == -1: 
    valfold = (fold+1) % totalfold
  print('valfold'+str(valfold))
  trainls, valls, testls = [], [], []
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test[:]
      testls = test[:]
    elif count == valfold:
      valls = test[:]
    else:
      for i in test:
        trainls.append(i)
    count += 1
  return trainls, valls, testls

def loadim(fname, preprocesspath=preprocesspath):
  ''' from preprocess path load fname
  fname file name in preprocesspath
  aug is true, we augment im fliplr, rot 4'''
  ims = []
  with open(join(preprocesspath, fname), 'rb') as inputfile:
    im = cPickle.load(inputfile)
    #up_bound = np.random.choice(174)                          #zero out square
    #right_bound = np.random.choice(174)
    img = im
    #img[up_bound:(up_bound+50), right_bound:(right_bound+50)] = 0.0
    ims.append(img)
    inputfile.close()
  return ims

def loaddata(fold, totalfold, usedream=True, aug=True):
  '''get the fold th train and  test data from inbreast
  fold is the returned fold th data, from 0 to totalfold-1
  total fold is for the cross validation'''
  mydict = readlabel()
  mydictkey = mydict.keys()
  mydictvalue = mydict.values()
  trainindex, testindex = cvsplit(fold, totalfold, mydict)
  if aug == True:
    traindata, trainlabel = np.zeros((6*len(trainindex),227,227)), np.zeros((6*len(trainindex),))
  else:
    traindata, trainlabel = np.zeros((len(trainindex),227,227)), np.zeros((len(trainindex),))
  testdata, testlabel =  np.zeros((len(testindex),227,227)), np.zeros((len(testindex),))
  traincount = 0
  for i in xrange(len(trainindex)):
    ims = loadim(mydictkey[trainindex[i]]+'.pickle', aug=aug)
    for im in ims:
      traindata[traincount, :, :] = im
      trainlabel[traincount] = mydictvalue[trainindex[i]]
      traincount += 1
  assert(traincount==traindata.shape[0])
  testcount = 0
  for i in xrange(len(testindex)):
    ims = loadim(mydictkey[testindex[i]]+'.pickle', aug=aug)
    testdata[testcount,:,:] = ims[0]
    testlabel[testcount] = mydictvalue[testindex[i]]
    testcount += 1
  assert(testcount==testdata.shape[0])
  if usedream:
    outx, outy = extractdreamdata()
    traindata = np.concatenate((traindata,outx), axis=0)
    trainlabel = np.concatenate((trainlabel,outy), axis=0)
  return traindata, trainlabel, testdata, testlabel

def loaddataenhance(fold, totalfold, valfold=-1, valnum=60):
  '''get the fold th train and  test data from inbreast
  fold is the returned fold th data, from 0 to totalfold-1
  total fold is for the cross validation'''
  mydict = readlabel()
  mydictkey = mydict.keys()
  mydictvalue = mydict.values()
  trainindex, valindex, testindex = cvsplitenhance(fold, totalfold, mydict, valfold=valfold)
  traindata, trainlabel = np.zeros((len(trainindex),227,227)), np.zeros((len(trainindex),))
  valdata, vallabel =  np.zeros((len(valindex),227,227)), np.zeros((len(valindex),))
  testdata, testlabel =  np.zeros((len(testindex),227,227)), np.zeros((len(testindex),))
  traincount = 0
  for i in xrange(len(trainindex)):
    ims = loadim(mydictkey[trainindex[i]]+'227.pickle')
    for im in ims:
      traindata[traincount, :, :] = im
      trainlabel[traincount] = int(mydictvalue[trainindex[i]])
      traincount += 1
  assert(traincount==traindata.shape[0])
  valcount = 0
  for i in xrange(len(valindex)):
    ims = loadim(mydictkey[valindex[i]]+'227.pickle')
    valdata[valcount,:,:] = ims[0]
    vallabel[valcount] = int(mydictvalue[valindex[i]])
    valcount += 1
  assert(valcount==valdata.shape[0])
  testcount = 0
  for i in xrange(len(testindex)):
    #print mydictkey[testindex[i]]
    ims = loadim(mydictkey[testindex[i]]+'227.pickle')
    testdata[testcount,:,:] = ims[0]
    testlabel[testcount] = int(mydictvalue[testindex[i]])
    testcount += 1
  assert(testcount==testdata.shape[0])
  #print(valdata.shape)
  randindex = np.random.permutation(valdata.shape[0])
  valdata = valdata[randindex,:,:]
  vallabel = vallabel[randindex]
  #print(valdata.shape)
  traindata = np.concatenate((traindata, valdata[valnum:,:,:]), axis=0)
  trainlabel = np.concatenate((trainlabel, vallabel[valnum:]), axis=0)
  valdata = valdata[:valnum,:,:]
  vallabel = vallabel[:valnum]
  maxvalue = (traindata.max()*1.0)
  print('inbreast max %f', maxvalue)
  traindata = traindata / maxvalue
  valdata = valdata / maxvalue
  testdata = testdata / maxvalue
  print('train data feature')
  #meanx = traindata.mean()
  #stdx = traindata.std()
  #traindata -= meanx
  #traindata /= stdx
  #valdata -= meanx
  #valdata /= stdx
  #testdata -= meanx
  #testdata /= stdx
  print(traindata.mean(), traindata.std(), traindata.max(), traindata.min())
  print('val data feature')
  print(valdata.mean(), valdata.std(), valdata.max(), valdata.min())
  print('test data feature')
  print(testdata.mean(), testdata.std(), testdata.max(), testdata.min())
  #meandata = traindata.mean()
  #stddata = traindata.std()
  #traindata = traindata - meandata
  #traindata = traindata / stddata
  #valdata = valdata - meandata
  #valdata = valdata / stddata
  #testdata = testdata - meandata
  #testdata = testdata / stddata
  return traindata, trainlabel, valdata, vallabel, testdata, testlabel

if __name__ == '__main__':
  traindata, trainlabel, testdata, testlabel = loaddata(0, 5)
  print(sum(trainlabel), sum(testlabel))

  traindata, trainlabel, valdata, vallabel, testdata, testlabel = loaddataenhance(0, 5)
  print(sum(trainlabel), sum(vallabel), sum(testlabel))