Skip to content
Binary file modified Lib/test/test_tools/msgfmt_data/fuzzy.mo
Binary file not shown.
Binary file modified Lib/test/test_tools/msgfmt_data/general.mo
Binary file not shown.
5 changes: 4 additions & 1 deletion Lib/test/test_tools/test_msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def test_compilation(self):

self.assertDictEqual(actual._catalog, expected._catalog)

def test_hash_table(self):
pass
Comment thread
StanFromIreland marked this conversation as resolved.
Outdated

def test_translations(self):
with open(data_dir / 'general.mo', 'rb') as f:
t = GNUTranslations(f)
Expand Down Expand Up @@ -126,7 +129,7 @@ def test_version(self):
for option in ('--version', '-V'):
res = assert_python_ok(msgfmt, option)
out = res.out.decode('utf-8').strip()
self.assertEqual('msgfmt.py 1.2', out)
self.assertEqual('msgfmt.py 1.3', out)

def test_invalid_option(self):
res = assert_python_failure(msgfmt, '--invalid-option')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:program:`msgfmt` now generates GNU hash tables.
90 changes: 80 additions & 10 deletions Tools/i18n/msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from email.parser import HeaderParser
import codecs

__version__ = "1.2"
__version__ = "1.3"
Comment thread
StanFromIreland marked this conversation as resolved.
Outdated


MESSAGES = {}
Expand All @@ -60,21 +60,57 @@ def add(ctxt, id, str, fuzzy):
def generate():
"Return the generated output."
global MESSAGES

def hash_insert_entry(string, i):
hash_val = hashpjw(string)
hash_cursor = hash_val % hash_tab_size
inc = 1 + (hash_val % (hash_tab_size - 2))
while hash_table[hash_cursor] != 0:
Comment thread
StanFromIreland marked this conversation as resolved.
Outdated
hash_cursor += inc
hash_cursor %= hash_tab_size
hash_table[hash_cursor] = i + 1

# From [gettext.git]/gettext-tools/src/write-mo.c:
# Each string has an associate hashing value V, computed by a fixed
# function. To locate the string we use open addressing with double
# hashing. The first index will be V % M, where M is the size of the
# hashing table. If no entry is found, iterating with a second,
# independent hashing function takes place. This second value will
# be 1 + V % (M - 2).
# The approximate number of probes will be
#
# for unsuccessful search: (1 - N / M) ^ -1
# for successful search: - (N / M) ^ -1 * ln (1 - N / M)
#
# where N is the number of keys.
#
# If we now choose M to be the next prime bigger than 4 / 3 * N,
# we get the values
# 4 and 1.85 resp.
# Because unsuccessful searches are unlikely this is a good value.
# Formulas: [Knuth, The Art of Computer Programming, Volume 3,
# 766 Sorting and Searching, 1973, Addison Wesley]
hash_tab_size = next_prime((len(sorted(MESSAGES.keys())) * 4) // 3)
Comment thread
StanFromIreland marked this conversation as resolved.
Outdated
if hash_tab_size <= 2:
hash_tab_size = 3
hash_table = array.array("I", [0] * hash_tab_size)

# the keys are sorted in the .mo file
keys = sorted(MESSAGES.keys())
offsets = []
ids = strs = b''
for id in keys:
for i, id in enumerate(keys):
# For each string, we need size and file offset. Each string is NUL
# terminated; the NUL does not count into the size.
hash_insert_entry(id, i)
offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
ids += id + b'\0'
strs += MESSAGES[id] + b'\0'
output = ''
# The header is 7 32-bit unsigned integers. We don't use hash tables, so
## FIX ME The header is 7 32-bit unsigned integers. We use hash tables, so
# the keys start right after the index tables.
# translated string.
keystart = 7*4+16*len(keys)
keystart = 7*4+16*len(keys)+hash_tab_size*4
# and the values start after the keys
valuestart = keystart + len(ids)
koffsets = []
Expand All @@ -86,13 +122,15 @@ def generate():
voffsets += [l2, o2+valuestart]
offsets = koffsets + voffsets
output = struct.pack("Iiiiiii",
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
0, 0) # size and offset of hash table
0x950412de, # Magic
Comment thread
StanFromIreland marked this conversation as resolved.
Outdated
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
hash_tab_size, # size of hash table
7 * 4 + 2 * (len(keys) * 8)) # offset of hash table
output += array.array("i", offsets).tobytes()
output += hash_table.tobytes()
output += ids
output += strs
return output
Expand Down Expand Up @@ -253,5 +291,37 @@ def main():
make(filename, outfile)


# Utilities for writing hash table

def hashpjw(str_param):
hval = 0
for s in str_param:
if not s:
break
hval <<= 4
hval += s
g = hval & 0xF << 28
if g != 0:
hval ^= g >> 24
hval ^= g
return hval


def next_prime(start):
def is_prime(num):
divn = 3
sq = divn * divn
while sq < num and num % divn != 0:
divn += 1
sq += 4 * divn
divn += 1

return num % divn != 0

candidate = start | 1
while not is_prime(candidate):
candidate += 2
return candidate

if __name__ == '__main__':
main()
Loading