forked from abasirat/principal_word_vectors
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprinc_wvec.sh
52 lines (38 loc) · 1.38 KB
/
princ_wvec.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/bin/bash
#
# This program is written by Ali Basirat [email protected] as part of the
# project Principal Word Vectors at http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-353866
# You are allowed to modify or distribute it if you keep this header part
#
#
# ---------------------------------------------------------------------------
# Run configuration. Edit the values below to change a run; every data file
# path is derived from DATA_PREFIX by appending an extension.
# ---------------------------------------------------------------------------

# Programs invoked by this script.
CWVEC="cwvec/build/cwvec"
PWVEC="pwvec/python/pwvec.py"

# Common path prefix for all data files (original note: "raw dep_index or raw").
DATA_PREFIX="./cwvec/test/raw"

# Files derived from the prefix.
CORPUS="${DATA_PREFIX}.txt"          # given to cwvec as --input
CMAT="${DATA_PREFIX}.bin"            # cwvec --output, then first argument of pwvec
VOCAB="${DATA_PREFIX}.vcb"           # given to cwvec as --vocab
FEATURE="${DATA_PREFIX}.feat"        # given to cwvec as --feature
WORD_VECTORS="${DATA_PREFIX}.wvec"   # second argument of pwvec
EMBEDDINGS="${DATA_PREFIX}.wembed"   # final output of the paste/awk step

# cwvec options.
CORPUS_TYPE="raw"    # raw or annotated
# NOTE(review): the original comment read "or pow or indexed"; the first
# alternative was presumably a different name -- confirm against cwvec's help.
CONTEXT_TYPE="pow"   # pow or indexed
WINDOW="5"
MIN_VCOUNT="100"
#MIN_FCOUNT=100 # only with annotated corpora
MEM="4"              # passed to cwvec as --max-memory (unit not shown here)

# pwvec options.
FEATURE_SELECTION="frequency"     # or entropy
TRANSFORMATION="MaximumEntropy"   # ppmi, Hellinger, or MaximumEntropy
# ---------------------------------------------------------------------------
# Pipeline: cwvec -> pwvec -> paste/awk.
#
# Each stage consumes the file written by the previous one, so the script
# stops with a non-zero exit status as soon as a stage fails instead of
# carrying on with missing or stale intermediate files. Diagnostics go to
# stderr. All expansions are quoted so paths containing spaces or glob
# characters are passed through intact.
# ---------------------------------------------------------------------------

# Make a pipeline fail when ANY of its commands fails; without this the
# paste | awk step below would report only awk's status.
set -o pipefail

# Stage 1: run cwvec on the corpus, writing its output to $CMAT.
if ! "$CWVEC" --input "$CORPUS" --output "$CMAT" \
  --corpus-type "$CORPUS_TYPE" \
  --window "$WINDOW" \
  --context-type "$CONTEXT_TYPE" \
  --vocab "$VOCAB" --min-vcount "$MIN_VCOUNT" \
  --feature "$FEATURE" \
  --normalize \
  --max-memory "$MEM" --verbose; then
  echo "error while running cwvec " >&2
  exit 1
fi

# Stage 2: run pwvec on $CMAT to produce $WORD_VECTORS.
if ! python3 "$PWVEC" "$CMAT" "$WORD_VECTORS" "$FEATURE_SELECTION" "$TRANSFORMATION"; then
  echo "error while running pwvec " >&2
  exit 1
fi

# Stage 3: join the vocabulary and vector files line by line, then print
# field 1 followed by fields 3..NF separated by single spaces (field 2 of
# each joined line is dropped).
if ! paste "$VOCAB" "$WORD_VECTORS" \
  | awk '{printf("%s",$1) ; for (i=3;i<=NF;i++) printf(" %s", $i) ; printf("\n")}' > "$EMBEDDINGS"; then
  echo "error while pasting " >&2
  exit 1
fi