3434from email .parser import HeaderParser
3535import codecs
3636
37- __version__ = "1.2 "
37+ __version__ = "1.3 "
3838
3939
4040MESSAGES = {}
@@ -60,21 +60,57 @@ def add(ctxt, id, str, fuzzy):
6060def generate ():
6161 "Return the generated output."
6262 global MESSAGES
63+
64+ def hash_insert_entry (string , i ):
65+ hash_val = hashpjw (string )
66+ hash_cursor = hash_val % hash_tab_size
67+ inc = 1 + (hash_val % (hash_tab_size - 2 ))
68+ while hash_table [hash_cursor ] != 0 :
69+ hash_cursor += inc
70+ hash_cursor %= hash_tab_size
71+ hash_table [hash_cursor ] = i + 1
72+
73+ # From [gettext.git]/gettext-tools/src/write-mo.c:
74+ # Each string has an associate hashing value V, computed by a fixed
75+ # function. To locate the string we use open addressing with double
76+ # hashing. The first index will be V % M, where M is the size of the
77+ # hashing table. If no entry is found, iterating with a second,
78+ # independent hashing function takes place. This second value will
79+ # be 1 + V % (M - 2).
80+ # The approximate number of probes will be
81+ #
82+ # for unsuccessful search: (1 - N / M) ^ -1
83+ # for successful search: - (N / M) ^ -1 * ln (1 - N / M)
84+ #
85+ # where N is the number of keys.
86+ #
87+ # If we now choose M to be the next prime bigger than 4 / 3 * N,
88+ # we get the values
89+ # 4 and 1.85 resp.
90+ # Because unsuccessful searches are unlikely this is a good value.
91+ # Formulas: [Knuth, The Art of Computer Programming, Volume 3,
92+ # 766 Sorting and Searching, 1973, Addison Wesley]
93+ hash_tab_size = next_prime ((len (sorted (MESSAGES .keys ())) * 4 ) // 3 )
94+ if hash_tab_size <= 2 :
95+ hash_tab_size = 3
96+ hash_table = array .array ("I" , [0 ] * hash_tab_size )
97+
6398 # the keys are sorted in the .mo file
6499 keys = sorted (MESSAGES .keys ())
65100 offsets = []
66101 ids = strs = b''
67- for id in keys :
102+ for i , id in enumerate ( keys ) :
68103 # For each string, we need size and file offset. Each string is NUL
69104 # terminated; the NUL does not count into the size.
105+ hash_insert_entry (id , i )
70106 offsets .append ((len (ids ), len (id ), len (strs ), len (MESSAGES [id ])))
71107 ids += id + b'\0 '
72108 strs += MESSAGES [id ] + b'\0 '
73109 output = ''
74- # The header is 7 32-bit unsigned integers. We don't use hash tables, so
110+ ## FIX ME The header is 7 32-bit unsigned integers. We use hash tables, so
75111 # the keys start right after the index tables.
76112 # translated string.
77- keystart = 7 * 4 + 16 * len (keys )
113+ keystart = 7 * 4 + 16 * len (keys )+ hash_tab_size * 4
78114 # and the values start after the keys
79115 valuestart = keystart + len (ids )
80116 koffsets = []
@@ -86,13 +122,15 @@ def generate():
86122 voffsets += [l2 , o2 + valuestart ]
87123 offsets = koffsets + voffsets
88124 output = struct .pack ("Iiiiiii" ,
89- 0x950412de , # Magic
90- 0 , # Version
91- len (keys ), # # of entries
92- 7 * 4 , # start of key index
93- 7 * 4 + len (keys )* 8 , # start of value index
94- 0 , 0 ) # size and offset of hash table
125+ 0x950412de , # Magic
126+ 0 , # Version
127+ len (keys ), # # of entries
128+ 7 * 4 , # start of key index
129+ 7 * 4 + len (keys )* 8 , # start of value index
130+ hash_tab_size , # size of hash table
131+ 7 * 4 + 2 * (len (keys ) * 8 )) # offset of hash table
95132 output += array .array ("i" , offsets ).tobytes ()
133+ output += hash_table .tobytes ()
96134 output += ids
97135 output += strs
98136 return output
@@ -252,5 +290,37 @@ def main():
252290 make (filename , outfile )
253291
254292
293+ # Utilities for writing hash table
294+
295+ def hashpjw (str_param ):
296+ hval = 0
297+ for s in str_param :
298+ if not s :
299+ break
300+ hval <<= 4
301+ hval += s
302+ g = hval & 0xF << 28
303+ if g != 0 :
304+ hval ^= g >> 24
305+ hval ^= g
306+ return hval
307+
308+
309+ def next_prime (start ):
310+ def is_prime (num ):
311+ divn = 3
312+ sq = divn * divn
313+ while sq < num and num % divn != 0 :
314+ divn += 1
315+ sq += 4 * divn
316+ divn += 1
317+
318+ return num % divn != 0
319+
320+ candidate = start | 1
321+ while not is_prime (candidate ):
322+ candidate += 2
323+ return candidate
324+
255325if __name__ == '__main__' :
256326 main ()
0 commit comments