-
Notifications
You must be signed in to change notification settings - Fork 0
/
keep_first.py
executable file
·111 lines (85 loc) · 2.47 KB
/
keep_first.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/python
"""
Remove duplicate lines in a file, keeping only the first occurrence of a line.
"""
from sys import argv,stdin,stderr,exit
from math import ceil
def usage(s=None):
    """Terminate the program with the usage text.

    The text is handed to exit(), so it is reported as the exit message
    rather than printed on standard output.

    s -- optional complaint shown on its own line ahead of the usage text.
    """
    help_text = """
usage: cat items_file | keep_first [options] > items_file
--head=<number> limit the number of input lines
--progress=<number> periodically report how many lines we've read"""

    # Lead with the specific complaint, when the caller supplied one.
    if s is not None:
        help_text = "%s\n%s" % (s, help_text)

    exit(help_text)
def main():
    """Copy stdin to stdout, dropping every line that has been seen before.

    Command-line options (read from argv):
      --head=<number>      stop after this many input lines
      --progress=<number>  report the line count on stderr at this interval
      --debug[=a,b,...]    append names to the module-level debug list

    Anything else exits through usage().  Numeric values go through
    int_with_unit(), so suffixes such as "10K" are accepted.

    Lines are compared after trailing whitespace has been stripped, so two
    lines that differ only in trailing whitespace count as duplicates, and
    output lines never carry trailing whitespace.
    """
    global debug

    # stdout is not among the names the file imports from sys at top level.
    from sys import stdout

    # parse the command line
    headLimit = None
    reportProgress = None
    debug = []

    for arg in argv[1:]:
        argVal = arg.split("=", 1)[1] if "=" in arg else None

        if arg.startswith("--head="):
            headLimit = int_with_unit(argVal)
        elif arg.startswith("--progress="):
            reportProgress = int_with_unit(argVal)
        elif arg == "--debug":
            debug += ["debug"]
        elif arg.startswith("--debug="):
            debug += argVal.split(",")
        else:
            usage("unrecognized option: %s" % arg)

    # read the items
    seen = set()
    lineNum = 0
    for line in stdin:
        lineNum += 1

        if headLimit is not None and lineNum > headLimit:
            stderr.write("limit of %s lines reached\n" % commatize(headLimit))
            break

        # A zero interval disables progress reports instead of dividing by
        # zero in the modulo below.
        if reportProgress and lineNum % reportProgress == 0:
            stderr.write("progress: line %s\n" % commatize(lineNum))

        line = line.rstrip()
        if line in seen:
            continue
        seen.add(line)

        # write() rather than a print statement so the script parses under
        # both Python 2 and Python 3.
        stdout.write(line + "\n")
# int_with_unit--
#	Parse a string as an integer, allowing unit suffixes

def int_with_unit(s):
    """Convert text such as "250", "10K" or "1.5M" to an integer.

    A single trailing K, M or G scales the value by a thousand, a million
    or a billion.  When the numeric part is not a plain integer it is read
    as a float and the scaled result is rounded up.

    Raises ValueError if the numeric part is not a number at all.
    """
    scale_for_suffix = {
        "K": 1000,
        "M": 1000 * 1000,
        "G": 1000 * 1000 * 1000,
    }

    # s[-1:] is "" for an empty string, which simply misses the table.
    scale = scale_for_suffix.get(s[-1:])
    if scale is None:
        scale = 1
        number = s
    else:
        number = s[:-1]

    try:
        return int(number) * scale
    except ValueError:
        return int(ceil(float(number) * scale))
# commatize--
#	Convert a numeric string into one with commas.

def commatize(s):
    """Return s with commas separating each group of three integer digits.

    s may be a string or any other value; non-strings are converted with
    str() first.  A leading "-" and anything from the first "." onward are
    carried through unchanged; only the integer part is grouped.  If the
    integer part does not parse as an integer, the text is returned as is.

    Examples: 1234567 -> "1,234,567", "-1234.5" -> "-1,234.5",
    "abc" -> "abc".
    """
    if not isinstance(s, str):
        s = str(s)

    (prefix, val, suffix) = ("", s, "")
    if val.startswith("-"):
        (prefix, val) = ("-", val[1:])
    if "." in val:
        (val, suffix) = val.split(".", 1)
        suffix = "." + suffix

    # Only text that parses as an integer is grouped; anything else is
    # returned untouched.
    try:
        int(val)
    except ValueError:
        return s

    digits = len(val)
    if digits > 3:
        # The leftmost group holds whatever is left once the remaining
        # digits are split into threes.
        leader = digits % 3
        chunks = []
        if leader != 0:
            chunks.append(val[:leader])
        # range() rather than xrange() so this runs on Python 2 and 3.
        chunks.extend(val[ix:ix + 3] for ix in range(leader, digits, 3))
        val = ",".join(chunks)

    return prefix + val + suffix
# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()