External sort: Difference between revisions
mNo edit summary |
(python) |
||
Line 2:
The sorting algorithm can be any popular sort, like quicksort. For simplicity one can assume that the file consists of fixed length integers and that the sort function is less-than (<).
=={{header|python}}==
A technique demonstrated with a short string character data.
<lang python>
#! /usr/bin/python3
'''
$ # example session in bash
$ python3 external_sort.py
expect 123456789
memory size 1 passed
memory size 2 passed
memory size 3 passed
memory size 4 passed
memory size 5 passed
memory size 6 passed
memory size 7 passed
memory size 8 passed
memory size 9 passed
memory size 10 passed
memory size 11 passed
'''
import io
def sort_large_file(n: int, source: open, sink: open, file_opener = open)->None:
'''
approach:
break the source into files of size n
sort each of these files
merge these onto the sink
'''
# store sorted chunks into files of size n
mergers = []
while True:
text = list(source.read(n))
if not len(text):
break;
text.sort()
merge_me = file_opener()
merge_me.write(''.join(text))
mergers.append(merge_me)
merge_me.seek(0)
# merge onto sink
stack_tops = [f.read(1) for f in mergers]
while stack_tops:
c = min(stack_tops)
sink.write(c)
i = stack_tops.index(c)
t = mergers[i].read(1)
if t:
stack_tops[i] = t
else:
del stack_tops[i]
mergers[i].close()
del mergers[i]
def main():
'''
test case
sort 6,7,8,9,2,5,3,4,1 with several memory sizes
'''
# load test case into a file like object
input_file_too_large_for_memory = io.StringIO('678925341')
# generate the expected output
t = list(input_file_too_large_for_memory.read())
t.sort()
expect = ''.join(t)
print('expect', expect)
# attempt to sort with several memory sizes
for memory_size in range(1,12):
input_file_too_large_for_memory.seek(0)
output_file_too_large_for_memory = io.StringIO()
sort_large_file(memory_size, input_file_too_large_for_memory, output_file_too_large_for_memory, io.StringIO)
output_file_too_large_for_memory.seek(0)
assert(output_file_too_large_for_memory.read() == expect)
print('memory size {} passed'.format(memory_size))
if __name__ == '__main__':
example = main
example()
</lang>
|
Revision as of 06:21, 12 March 2017
Sort a huge file too large to fit into memory. The algorithm consists in reading a large file to be sorted in chunks of data small enough to fit in main memory, sort each of the chunks, write them out to a temporary file, and finally combined the smaller subfiles into a single larger file. For more info see: https://en.wikipedia.org/wiki/External_sorting
The sorting algorithm can be any popular sort, like quicksort. For simplicity one can assume that the file consists of fixed length integers and that the sort function is less-than (<).
python
A technique demonstrated with a short string character data. <lang python>
- ! /usr/bin/python3
$ # example session in bash $ python3 external_sort.py expect 123456789 memory size 1 passed memory size 2 passed memory size 3 passed memory size 4 passed memory size 5 passed memory size 6 passed memory size 7 passed memory size 8 passed memory size 9 passed memory size 10 passed memory size 11 passed
import io
def sort_large_file(n: int, source: open, sink: open, file_opener = open)->None:
approach: break the source into files of size n sort each of these files merge these onto the sink
# store sorted chunks into files of size n mergers = [] while True: text = list(source.read(n)) if not len(text): break; text.sort() merge_me = file_opener() merge_me.write(.join(text)) mergers.append(merge_me) merge_me.seek(0)
# merge onto sink stack_tops = [f.read(1) for f in mergers] while stack_tops: c = min(stack_tops) sink.write(c) i = stack_tops.index(c) t = mergers[i].read(1) if t: stack_tops[i] = t else: del stack_tops[i] mergers[i].close() del mergers[i]
def main():
test case sort 6,7,8,9,2,5,3,4,1 with several memory sizes
# load test case into a file like object input_file_too_large_for_memory = io.StringIO('678925341')
# generate the expected output t = list(input_file_too_large_for_memory.read()) t.sort() expect = .join(t) print('expect', expect)
# attempt to sort with several memory sizes for memory_size in range(1,12): input_file_too_large_for_memory.seek(0) output_file_too_large_for_memory = io.StringIO() sort_large_file(memory_size, input_file_too_large_for_memory, output_file_too_large_for_memory, io.StringIO) output_file_too_large_for_memory.seek(0) assert(output_file_too_large_for_memory.read() == expect) print('memory size {} passed'.format(memory_size))
if __name__ == '__main__':
example = main example()
</lang>