-
Notifications
You must be signed in to change notification settings - Fork 33
/
length_encoder.pl
executable file
·114 lines (90 loc) · 2.67 KB
/
length_encoder.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!?usr/bin/perl
# Author: Daniel "Trizen" Șuteu
# License: GPLv3
# Date: 04 May 2015
# Website: https://github.com/trizen
# A very basic length encoder
use 5.010;
use strict;
use warnings;
use Data::Dump qw(pp);
# produce encode and decode dictionary from a tree
sub walk {
my ($node, $code, $h) = @_;
my $c = $node->[0];
if (ref $c) { walk($c->[$_], $code . $_, $h) for 0, 1 }
else { $h->{$c} = $code }
$h;
}
# make a tree, and return resulting dictionaries
sub mktree {
my %freq = @_;
my @nodes = map { [$_, $freq{$_}] } keys %freq;
if (@nodes == 1) {
return {$nodes[0][0] => '0'};
}
do { # poor man's priority queue
@nodes = sort { $a->[1] <=> $b->[1] } @nodes;
my ($x, $y) = splice(@nodes, 0, 2);
push @nodes, [[$x, $y], $x->[1] + $y->[1]];
} while (@nodes > 1);
walk($nodes[0], '', {}, {});
}
sub length_encoder {
my ($str) = @_;
my %table;
my @chars = split(//, $str);
my $lim = $#chars;
my %t;
for (my $i = 0 ; $i < $lim ; $i++) {
for (my $j = $i + 1 ; $j <= $lim ; $j++) {
last if $j + ($j - $i) + 1 > $lim;
my $key = join('', @chars[$i .. $j]);
if (join('', @chars[$j + 1 .. $j + ($j - $i) + 1]) eq $key) {
if (not exists $t{$key}) {
if (exists $t{substr($key, 0, -1)}) {
last;
}
$t{$key} = length($key);
}
else {
$t{$key}++;
}
}
}
}
my ($dict) = keys(%t) ? mktree(%t) : {};
my @sorted_tokens =
sort { length($dict->{$a}) <=> length($dict->{$b}) or $t{$b} <=> $t{$a} or $a cmp $b } keys %t;
say "Weights: ", pp(\%t);
say "Sorted: @sorted_tokens";
say "Bits: ", pp($dict);
my $regex = do {
my @tries = map { "(?<token>\Q$_\E)(?<rest>(?:\Q$_\E)*+)" } @sorted_tokens;
local $" = '|';
@sorted_tokens ? qr/^(?:@tries|(?<token>.))/s : qr/^(?<token>.)/s;
};
my $enc = '';
while ($str =~ s/$regex//) {
my $m = $+{token};
my $r = $+{rest};
if (defined $r) {
$enc .= ("[$dict->{$m}x" . (1 + length($r) / length($m)) . "]");
}
else {
$enc .= $m;
}
}
return $enc;
}
foreach my $str (
qw(
ABABABAB
ABABABABAAAAAAAAAAAAAFFFFFFFFFFFFFFFFFFFDDDDDDDDDDDDDDDDDDDDJKLABABVADSABABAB
DABDDB DABDDBBDDBA ABBDDD ABRACADABRA TOBEORNOTTOBEORTOBEORNOT
)
) {
say "Encoding: $str";
say "Encoded: ", length_encoder($str);
say "-" x 80;
}