We can efficiently sort massive amounts of data using External Merge Sort Algorithm, when the data being sorted don’t fit into the main memory (which is usually RAM) and resides in the slower external memory (which is usually a hard disk).

External Merge Sort uses a hybrid sort-merge technique. The chunks of data small enough to fit in the RAM are read, sorted, and written out to a temporary file during the sorting phase. In the merge phase, the sorted sub-files are combined into a single larger file.



In other words, external External Merge Sort sorts the chunks of data that fits in the main memory, then merges the sorted chunks together. i.e.

We first divide the file into runs such that the size of a run is small enough to fit into main memory.

Then sort each run in main memory using standard Merge Sort sorting algorithm.

Finally we merge the resulting runs together into successively bigger runs, until the file is sorted.



Following is C++ code to demonstrate External Merge Sort Algorithm:

#include <iostream> #include <algorithm> #include <queue> #include <limits> using namespace std; struct MinHeapNode { // The element to be stored int element; // index of the array from which the element is taken int i; }; // Comparison object to be used to order the heap struct comp { bool operator()(const MinHeapNode lhs, const MinHeapNode rhs) const { return lhs.element > rhs.element; } }; FILE* openFile(char* fileName, char* mode) { FILE* fp = fopen(fileName, mode); if (fp == NULL) { perror("Error while opening the file.

"); exit(EXIT_FAILURE); } return fp; } // Merges k sorted files. Names of files are assumed to be 1, 2, ... k void mergeFiles(char *output_file, int n, int k) { FILE* in[k]; for (int i = 0; i < k; i++) { char fileName[2]; // convert i to string snprintf(fileName, sizeof(fileName), "%d", i); // Open output files in read mode. in[i] = openFile(fileName, "r"); } // FINAL OUTPUT FILE FILE *out = openFile(output_file, "w"); // Create a min heap with k heap nodes. Every heap node has first // element of scratch output file MinHeapNode harr[k]; priority_queue<MinHeapNode, vector<MinHeapNode>, comp> pq; int i; for (i = 0; i < k; i++) { // break if no output file is empty and // index i will be no. of input files if (fscanf(in[i], "%d ", &harr[i].element) != 1) break; // Index of scratch output file harr[i].i = i; pq.push(harr[i]); } int count = 0; // Now one by one get the minimum element from min heap and replace // it with next element. Run till all filled input files reach EOF while (count != i) { // Get the minimum element and store it in output file MinHeapNode root = pq.top(); pq.pop(); fprintf(out, "%d ", root.element); // Find the next element that should replace current root of heap. // The next element belongs to same input file as the current // minimum element. if (fscanf(in[root.i], "%d ", &root.element) != 1 ) { root.element = numeric_limits<int>::max(); count++; } // Replace root with next element of input file pq.push(root); } // close input and output files for (int i = 0; i < k; i++) fclose(in[i]); fclose(out); } // Using a merge-sort algorithm, create the initial runs and divide them // evenly among the output files void createInitialRuns(char *input_file, int run_size, int num_ways) { // For big input file FILE *in = openFile(input_file, "r"); // output scratch files FILE* out[num_ways]; char fileName[2]; for (int i = 0; i < num_ways; i++) { // convert i to string snprintf(fileName, sizeof(fileName), "%d", i); // Open output files in write mode. out[i] = openFile(fileName, "w"); } // allocate a dynamic array large enough to accommodate runs of // size run_size int* arr = new int[run_size]; bool more_input = true; int next_output_file = 0; int i; while (more_input) { // write run_size elements into arr from input file for (i = 0; i < run_size; i++) { if (fscanf(in, "%d ", &arr[i]) != 1) { more_input = false; break; } } // sort array using merge sort sort(arr, arr + i); // write the records to the appropriate scratch output file // can't assume that the loop runs to run_size // since the last run's length may be less than run_size for (int j = 0; j < i; j++) fprintf(out[next_output_file], "%d ", arr[j]); next_output_file++; } // deallocate memory delete arr; // close input and output files for (int i = 0; i < num_ways; i++) fclose(out[i]); fclose(in); } // Program to demonstrate external sorting int main() { // No. of partitions of input file int num_ways = 10; // The size of each partition int run_size = 1000; char input_file[] = "input.txt"; char output_file[] = "output.txt"; FILE* in = openFile(input_file, "w"); srand(time(NULL)); // generate input for (int i = 0; i < num_ways * run_size; i++) fprintf(in, "%d ", rand()); fclose(in); // Read the input file, create the initial runs, // and assign the runs to the scratch output files createInitialRuns(input_file, run_size, num_ways); // Merge the runs using the K-way merging mergeFiles(output_file, run_size, num_ways); return 0; } 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 #include <iostream> #include <algorithm> #include <queue> #include <limits> using namespace std ; struct MinHeapNode { // The element to be stored int element ; // index of the array from which the element is taken int i ; } ; // Comparison object to be used to order the heap struct comp { bool operator ( ) ( const MinHeapNode lhs , const MinHeapNode rhs ) const { return lhs . element > rhs . element ; } } ; FILE * openFile ( char * fileName , char * mode ) { FILE * fp = fopen ( fileName , mode ) ; if ( fp == NULL ) { perror ( "Error while opening the file.

" ) ; exit ( EXIT_FAILURE ) ; } return fp ; } // Merges k sorted files. Names of files are assumed to be 1, 2, ... k void mergeFiles ( char * output_file , int n , int k ) { FILE * in [ k ] ; for ( int i = 0 ; i < k ; i ++ ) { char fileName [ 2 ] ; // convert i to string snprintf ( fileName , sizeof ( fileName ) , "%d" , i ) ; // Open output files in read mode. in [ i ] = openFile ( fileName , "r" ) ; } // FINAL OUTPUT FILE FILE * out = openFile ( output_file , "w" ) ; // Create a min heap with k heap nodes. Every heap node has first // element of scratch output file MinHeapNode harr [ k ] ; priority_queue < MinHeapNode , vector < MinHeapNode > , comp > pq ; int i ; for ( i = 0 ; i < k ; i ++ ) { // break if no output file is empty and // index i will be no. of input files if ( fscanf ( in [ i ] , "%d " , &harr [ i ] . element ) != 1 ) break ; // Index of scratch output file harr [ i ] . i = i ; pq . push ( harr [ i ] ) ; } int count = 0 ; // Now one by one get the minimum element from min heap and replace // it with next element. Run till all filled input files reach EOF while ( count != i ) { // Get the minimum element and store it in output file MinHeapNode root = pq . top ( ) ; pq . pop ( ) ; fprintf ( out , "%d " , root . element ) ; // Find the next element that should replace current root of heap. // The next element belongs to same input file as the current // minimum element. if ( fscanf ( in [ root . i ] , "%d " , &root . element ) != 1 ) { root . element = numeric_limits < int > :: max ( ) ; count ++ ; } // Replace root with next element of input file pq . push ( root ) ; } // close input and output files for ( int i = 0 ; i < k ; i ++ ) fclose ( in [ i ] ) ; fclose ( out ) ; } // Using a merge-sort algorithm, create the initial runs and divide them // evenly among the output files void createInitialRuns ( char * input_file , int run_size , int num_ways ) { // For big input file FILE * in = openFile ( input_file , "r" ) ; // output scratch files FILE * out [ num_ways ] ; char fileName [ 2 ] ; for ( int i = 0 ; i < num_ways ; i ++ ) { // convert i to string snprintf ( fileName , sizeof ( fileName ) , "%d" , i ) ; // Open output files in write mode. out [ i ] = openFile ( fileName , "w" ) ; } // allocate a dynamic array large enough to accommodate runs of // size run_size int * arr = new int [ run_size ] ; bool more_input = true ; int next_output_file = 0 ; int i ; while ( more_input ) { // write run_size elements into arr from input file for ( i = 0 ; i < run_size ; i ++ ) { if ( fscanf ( in , "%d " , &arr [ i ] ) != 1 ) { more_input = false ; break ; } } // sort array using merge sort sort ( arr , arr + i ) ; // write the records to the appropriate scratch output file // can't assume that the loop runs to run_size // since the last run's length may be less than run_size for ( int j = 0 ; j < i ; j ++ ) fprintf ( out [ next_output_file ] , "%d " , arr [ j ] ) ; next_output_file ++ ; } // deallocate memory delete arr ; // close input and output files for ( int i = 0 ; i < num_ways ; i ++ ) fclose ( out [ i ] ) ; fclose ( in ) ; } // Program to demonstrate external sorting int main ( ) { // No. of partitions of input file int num_ways = 10 ; // The size of each partition int run_size = 1000 ; char input_file [ ] = "input.txt" ; char output_file [ ] = "output.txt" ; FILE * in = openFile ( input_file , "w" ) ; srand ( time ( NULL ) ) ; // generate input for ( int i = 0 ; i < num_ways * run_size ; i ++ ) fprintf ( in , "%d " , rand ( ) ) ; fclose ( in ) ; // Read the input file, create the initial runs, // and assign the runs to the scratch output files createInitialRuns ( input_file , run_size , num_ways ) ; // Merge the runs using the K-way merging mergeFiles ( output_file , run_size , num_ways ) ; return 0 ; }



Please note that this code doesn’t work on online compilers as it requires file creation permissions. When run locally, it will produce sample input file “input.txt” with 10000 random numbers. It sorts the numbers and puts the sorted numbers in a file “output.txt”. It also generates files with names 1, 2, .. to store sorted runs.



Author: Aditya Goel



References: https://en.wikipedia.org/wiki/External_sorting









(74 votes, average: 4.57 out of 5)

Loading...

Thanks for reading.