@@ -71,6 +71,7 @@ class excel:
7171 QUOTE_STRINGS , QUOTE_NOTNULL
7272from _csv import Dialect as _Dialect
7373
74+ from collections import defaultdict , Counter
7475from io import StringIO
7576
7677__all__ = ["QUOTE_MINIMAL" , "QUOTE_ALL" , "QUOTE_NONNUMERIC" , "QUOTE_NONE" ,
@@ -367,25 +368,30 @@ def _guess_delimiter(self, data, delimiters):
367368
368369 data = list (filter (None , data .split ('\n ' )))
369370
370- ascii = [ chr (c ) for c in range (127 )] # 7-bit ASCII
371+ ascii = { chr (c ) for c in range (127 )} # 7-bit ASCII
371372
372373 # build frequency tables
373374 chunkLength = min (10 , len (data ))
374375 iteration = 0
375- charFrequency = {}
376+ # {char -> {count_per_line -> num_lines_with_that_count}}
377+ charFrequency = defaultdict (Counter )
376378 modes = {}
377379 delims = {}
378380 start , end = 0 , chunkLength
379381 while start < len (data ):
380382 iteration += 1
381- for line in data [start :end ]:
382- for char in ascii :
383- metaFrequency = charFrequency .get (char , {})
384- # must count even if frequency is 0
385- freq = line .count (char )
386- # value is the mode
387- metaFrequency [freq ] = metaFrequency .get (freq , 0 ) + 1
388- charFrequency [char ] = metaFrequency
383+ chunk = data [start :end ]
384+ candidate_chars = set ("" .join (chunk ))
385+ candidate_chars .intersection_update (ascii )
386+ for line in chunk :
387+ for char in candidate_chars :
388+ count = line .count (char )
389+ charFrequency [char ][count ] += 1
390+
391+ missing_chars = ascii .difference (candidate_chars )
392+ chunk_len = len (chunk )
393+ for char in missing_chars :
394+ charFrequency [char ][0 ] += chunk_len
389395
390396 for char in charFrequency .keys ():
391397 items = list (charFrequency [char ].items ())
0 commit comments