Skip to content

Data Mining

Jaimie Murdock edited this page Apr 24, 2018 · 3 revisions

Data mining currently runs on a separate EC2 instance that is accessible only from inside the private network.

The server is launched and shut down via a cron job on the master node.

Configuring server

# Install git, make and the C++ compiler.
sudo yum install git make gcc-c++
# Create the inphosite user and the inpho group.
sudo useradd inphosite
sudo groupadd inpho
# -G without -a sets inphosite's supplementary group list to exactly "inpho".
sudo usermod -G inpho inphosite
# -a appends, so "inphosite" is added to the list instead of replacing "inpho".
sudo usermod -a -G inphosite inphosite
# Create /var/inpho, owned by user inphosite and group inpho.
sudo mkdir /var/inpho
sudo chown inphosite:inpho /var/inpho

# Open a shell as inphosite; the commands that follow are presumably typed
# into that shell.
sudo su inphosite
cd /tmp
# Download the Miniconda2 4.3.30 installer and run it with -b (batch mode,
# no interactive prompts).
wget https://repo.continuum.io/miniconda/Miniconda2-4.3.30-Linux-x86_64.sh
bash Miniconda2-4.3.30-Linux-x86_64.sh -b
# Prepend Miniconda's bin directory to PATH for future shells, then re-read
# ~/.bashrc so the current shell picks it up too.
# NOTE(review): assumes the installer's default prefix is
# /home/inphosite/miniconda2 - confirm.
echo 'export PATH="/home/inphosite/miniconda2/bin:$PATH"' >> ~/.bashrc
. ~/.bashrc

# Install the required Python packages with conda (-y: do not prompt),
# then download NLTK's "punkt" tokenizer data.
conda install -y mysql-python sphinx docutils nltk
python -m nltk.downloader punkt

# Clone the inpho repository into /var/inpho/inpho and install it in
# setuptools development mode, so the checkout itself is what gets imported.
cd /var/inpho
git clone https://github.com/inpho/inpho.git
cd inpho
python setup.py develop

# Copy inpho.ini from the inphoproject.org host into /var/inpho.
scp inphoproject.org:/var/inpho/inpho.ini /var/inpho/

# Create the data directory and copy .apriori_config into it from the same host.
mkdir /var/inpho/data
scp inphoproject.org:/var/inpho/data/.apriori_config /var/inpho/data/

Mount the SEP

# Create the /var/sep mount point.
sudo mkdir /var/sep
# Create the sep user and set its supplementary group list to "inpho".
sudo useradd sep
sudo usermod -G inpho sep
# Make /var/sep owned by user sep and group inpho.
sudo chown sep:inpho /var/sep

Add this to /etc/fstab:

# EFS file system mounted on /var/sep over NFS 4.1.
# Fields: device, mount point, type, mount options, dump flag, fsck pass.
fs-2c877655.efs.us-east-2.amazonaws.com:/    /var/sep    nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0

Create mining.sh

# Directory for mining.sh; the master node's cron job invokes
# /var/inpho/bin/mining.sh. The script's contents follow.
mkdir /var/inpho/bin
#!/bin/sh
# mining.sh - run every sep.py data-mining pass in sequence.
#
# All passes run from /var/inpho/data under `nice`. The script first runs
# sep.py without --load (--all --occur, --all, --idea, --thinker) and then
# with --load (--all, --idea, --thinker). A failing pass does not stop the
# later ones.
MINING_PATH=/var/inpho/inpho/inpho/corpus/sep.py

# Abort if the data directory is missing, rather than running every pass
# from whatever directory the script happened to be started in.
cd /var/inpho/data || {
  echo "mining.sh: cannot cd to /var/inpho/data" >&2
  exit 1
}

nice python "$MINING_PATH" --all --occur
nice python "$MINING_PATH" --all
nice python "$MINING_PATH" --idea
nice python "$MINING_PATH" --thinker
nice python "$MINING_PATH" --load --all
nice python "$MINING_PATH" --load --idea
nice python "$MINING_PATH" --load --thinker

Test

# Run mining.sh (from the current directory) under GNU time with -v for a
# verbose resource report. stdout goes to mining.log; stderr, which includes
# time's report, goes to mining.err.
/usr/bin/time -v bash mining.sh 1> mining.log 2> mining.err

Cronjob for Master node

#!/bin/bash
# Cron job for the master node: start the data-mining EC2 instance, run
# /var/inpho/bin/mining.sh on it over ssh, then stop the instance again.
#
# Requires the aws CLI, jq, and ssh access as inphosite to the instance's
# private IP. Mining stdout/stderr are written to /var/inpho/log/mining.log
# and /var/inpho/log/mining.err on this node (overwritten on every run).
#
# Exit status: 1 if the instance cannot be located, started or stopped;
# otherwise the exit status of the remote mining run.
INSTANCE_ID=i-0123456789abcdef

# Look up the private IP and the numeric state code. The jq filters are
# quoted so the shell never treats their [] as glob patterns.
IP=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
  | jq -r '.Reservations[].Instances[].PrivateIpAddress')
STATE=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
  | jq '.Reservations[].Instances[].State.Code')

# Without an address the ssh step cannot work, so give up before starting
# the instance. jq -r prints "null" when the field is absent.
if [[ -z "$IP" || "$IP" == "null" ]]; then
  echo "could not determine private IP of $INSTANCE_ID" >&2
  exit 1
fi

# EC2 instance state code 16 means "running".
if [[ "$STATE" != 16 ]]; then
  echo "starting $INSTANCE_ID"
  if ! aws ec2 start-instances --instance-ids "$INSTANCE_ID"; then
    echo "failed to start $INSTANCE_ID" >&2
    exit 1
  fi
  echo "waiting for $INSTANCE_ID"
  if aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"; then
    echo "$INSTANCE_ID running!"
  else
    # The start was requested but the instance never reported "running".
    # Ask for a stop so it is not left up, and skip the mining run.
    echo "$INSTANCE_ID did not reach the running state" >&2
    aws ec2 stop-instances --instance-ids "$INSTANCE_ID"
    exit 1
  fi
else
  echo "already started $INSTANCE_ID - CHECK IF UNEXPECTED!"
fi
#ssh -i ~/.ssh/inphoprojectaws.pem inphosite@$IP "echo 'here we are!'"
echo "begining data mining"
ssh "inphosite@$IP" "/var/inpho/bin/mining.sh" 1>/var/inpho/log/mining.log 2>/var/inpho/log/mining.err
mining_status=$?
if (( mining_status != 0 )); then
  echo "data mining on $INSTANCE_ID exited with status $mining_status" >&2
fi

# Stop the instance whether or not mining succeeded.
if ! aws ec2 stop-instances --instance-ids "$INSTANCE_ID"; then
  echo "failed to stop $INSTANCE_ID" >&2
  exit 1
fi
exit "$mining_status"