1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""================================================
UNIT TESTING EXAMPLE
===================================================
This is an example of how to include unit tests in your code
The origional problem is https://www.codingame.com/ide/puzzle/byte-pair-encoding
The objective is the compress a string by replacing character pairs recursively
For example, if given the string "aaabdaaabac"
Use the unused characters "ZYXW..." to replace pairs starting with the most frequent
Z replaces aa, and the string becomes ZabdZabac
Y replaces Za, and the string becomes YbdYbac
X replaces Yb, and the string becomes XdXac
There are no further replacements, because we only replace if we find at least one repeat of a pair.
"""
from string import ascii_lowercase
from string import ascii_uppercase
def mostCommonPair(data):
"""
Find the most frequent letter pair
Only count non-overlapping pairs
return None if less than 2 pairs are found
"""
pairs = {}
order = []
previous = None
for i in range(len(data)-1):
current = data[i:i+2]
if current != previous:
if current not in pairs:
order.append(current)
pairs[current] = pairs.get(current,0) + 1
previous = current
else:
# found an overlapping pair. reset previous
# so that "aaaa" will not be skipped completely
previous = None
highest_score = 0
highest_pair = None
for key in order:
value = pairs[key]
if value > highest_score:
highest_score = value
highest_pair = key
# return highest pair if 2 or more instances were found
return None if highest_score < 2 else highest_pair
# test to make sure mostCommonPair behaves correctly
assert( mostCommonPair("ZabdZabac") == "Za" )
assert( mostCommonPair("ZabdZababc") == "ab" )
assert( mostCommonPair("abZabdZac") == "ab" )
def readInData():
"""
read in the text
throw away all unneeded data like the number of lines
"""
n, m = [int(i) for i in input().split()]
return "".join([input() for i in range(n)])
def calculateUnused(data):
"""
Get a list of letters that are unused that will
be available for substitution.
Returns a list of reverse alphabetical letters
not seen in the data
"""
used = set([x for x in data])
unused = [x for x in ascii_lowercase + ascii_uppercase][::-1]
return [x for x in unused if x not in used]
# test to make sure calculateUnused is behaving correctly
assert( 'X' not in calculateUnused("abcdGBX") )
assert( 'Y' in calculateUnused("abcdGBX") )
assert( ['Z','Y','W'] == calculateUnused("abcdGBX")[:3] )
assert( len( calculateUnused("abcdGBX") ) == 26 + 26 - 7 )
def compressData(data,unused):
"""
keep compressing the data until there is no more ability to compress
Returns compressed data and list of transforms
"""
changes = []
most_common_pair = None if len(unused) == 0 else mostCommonPair(data)
while most_common_pair is not None:
changes.append(f"{unused[0]} = {most_common_pair}")
data = data.replace(most_common_pair, unused[0])
unused = unused[1:]
most_common_pair = None if len(unused) == 0 else mostCommonPair(data)
return data, changes
# tests to make sure compression works properly
assert( compressData("aaabdaaabac",["Z","Y","X","W"])[0] == "XdXac" )
assert( len(compressData("aaabdaaabac",["Z","Y","X","W"])[1]) == 3 )
assert( compressData("aaabdaaabac",["Z","Y","X","W"])[1][0] == "Z = aa" )
assert( compressData("aaabdaaabac",["Z","Y","X","W"])[1][1] == "Y = Za" )
assert( compressData("aaabdaaabac",["Z","Y","X","W"])[1][2] == "X = Yb" )
"""================================================
Main Program
================================================"""
data = "this is some test data and we will see what it does"
# uncomment this line to use this to read from stdin
# data = readInData( )
unused = calculateUnused(data)
data, changes = compressData(data, unused)
print(data)
for change in changes:
print(change)