-
Notifications
You must be signed in to change notification settings - Fork 23
/
project_vectors.m
executable file
·57 lines (45 loc) · 2.16 KB
/
project_vectors.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
function project_vectors(origForeignVecFile, origEnVecFile, subsetEnVecFile, subsetForeignVecFile, outputEnFile, outputForeignFile, truncRatio)
% PROJECT_VECTORS  Project two sets of word vectors into a shared CCA space.
%
%   Learns canonical-correlation projections from a pair of aligned subset
%   matrices (row i of the English subset corresponds to row i of the
%   foreign subset), applies them to the full English and foreign vector
%   matrices, and writes the results to disk.
%
%   Inputs:
%     origForeignVecFile   - space-delimited file of all foreign vectors
%     origEnVecFile        - space-delimited file of all English vectors
%     subsetEnVecFile      - space-delimited file of aligned English vectors
%     subsetForeignVecFile - space-delimited file of aligned foreign vectors
%     outputEnFile         - destination for the projected English vectors
%     outputForeignFile    - destination for the projected foreign vectors
%     truncRatio           - fraction of the canonical dimensions to keep;
%                            ceil(truncRatio * numDims) leading columns of
%                            each projection matrix are used
%
%   In every input file the first column holds the word itself and is
%   skipped while reading.
%
%   Outputs (files, space-delimited):
%     outputEnFile, outputForeignFile          - projected, row-normalised
%                                                vectors
%     [outputEnFile '.trans'],
%     [outputForeignFile '.trans']             - full (untruncated) CCA
%                                                projection matrices A and B

    % Read, clean and row-normalise all four matrices the same way, so the
    % subset matrices cannot keep a spurious trailing column that the
    % original matrices had stripped.
    origEnVecs        = readVectors(origEnVecFile);
    origForeignVecs   = readVectors(origForeignVecFile);
    subsetEnVecs      = readVectors(subsetEnVecFile);
    subsetForeignVecs = readVectors(subsetForeignVecFile);

    % Fail early, before running CCA, if the projection learned on the
    % subset could not be applied to the full matrices.
    if size(origEnVecs, 2) ~= size(subsetEnVecs, 2)
        error('project_vectors:dimMismatch', ...
              'English vectors have %d dimensions but the aligned English subset has %d.', ...
              size(origEnVecs, 2), size(subsetEnVecs, 2));
    end
    if size(origForeignVecs, 2) ~= size(subsetForeignVecs, 2)
        error('project_vectors:dimMismatch', ...
              'Foreign vectors have %d dimensions but the aligned foreign subset has %d.', ...
              size(origForeignVecs, 2), size(subsetForeignVecs, 2));
    end

    % Perform CCA on the subset of the aligned vectors
    [A, B] = canoncorr(subsetEnVecs, subsetForeignVecs);

    % Project the original matrices onto the (truncated) canonical directions
    origEnVecsProjected      = projectRows(origEnVecs, A, truncRatio);
    origForeignVecsProjected = projectRows(origForeignVecs, B, truncRatio);

    % Write the projected vectors to file
    dlmwrite(outputEnFile, origEnVecsProjected, ' ');
    dlmwrite(outputForeignFile, origForeignVecsProjected, ' ');

    % Write the projection matrices to file
    dlmwrite(strcat(outputEnFile, '.trans'), A, ' ');
    dlmwrite(strcat(outputForeignFile, '.trans'), B, ' ');

function vecs = readVectors(vecFile)
% Read a space-delimited vector file (skipping the leading word column),
% drop a trailing all-zero column if present, and normalise each row.

    vecs = dlmread(vecFile, ' ', 0, 1);

    % word2vec embeddings have a trailing space which matlab parses as an
    % additional column of all zeros. If the last column is all zeros,
    % remove it. The column-count guard avoids indexing column 0 when the
    % file yielded no numeric columns.
    lastCol = size(vecs, 2);
    if lastCol > 0 && norm(vecs(:, lastCol)) == 0
        vecs = vecs(:, 1:lastCol - 1);
    end

    vecs = normr(vecs);

function projected = projectRows(vecs, basis, truncRatio)
% Centre the rows of vecs on their column means, project them onto the
% leading ceil(truncRatio * size(basis, 2)) columns of basis, and
% normalise each resulting row.

    numKept = ceil(truncRatio * size(basis, 2));

    % mean(vecs, 1) is always the per-column mean; plain mean(vecs) would
    % collapse a single-row matrix to a scalar.
    centered = bsxfun(@minus, vecs, mean(vecs, 1));

    projected = normr(centered * basis(:, 1:numKept));