input_file  : Name of the input file, e.g. input.txt
output_file : Name of the output file, e.g. output.txt
run_size    : Number of elements in one run (a single run must fit in RAM)
num_ways    : Number of sorted runs (scratch files) to be merged



  1. 读取 input_file,每次最多读入 run_size 个元素(称为一个 run),并对读入数组的每个 run 执行以下操作。
  2. 使用归并排序(MergeSort)对该 run 进行排序。
  3. 将排序后的数组写入一个临时文件;第 i 个 run 写入名为 “i” 的文件。
  4. 使用“合并 k 个有序数组”中讨论的方法(最小堆)合并这些已排序的文件。


// C++ program to implement
// external sorting using
// merge sort
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <vector>

using namespace std;
// One entry of the merge heap: a value together with the index of the
// source it was read from.
struct MinHeapNode {
    // The element to be stored
    int element;
    // index of the array (scratch file) from which
    // the element is taken
    int i;
};
// Forward declaration of the helper that exchanges the contents of
// two min heap nodes; MinHeap::MinHeapify calls it before its
// definition appears further down in the file.
void swap(MinHeapNode* x, MinHeapNode* y);
// A class for Min Heap
class MinHeap {
    // pointer to array of elements in heap
    MinHeapNode* harr;
    // size of min heap
    int heap_size;
    // Constructor: creates a min
    // heap of given size
    MinHeap(MinHeapNode a[], int size);
    // to heapify a subtree with
    // root at given index
    void MinHeapify(int);
    // to get index of left child
    // of node at index i
    int left(int i) { return (2 * i + 1); }
    // to get index of right child
    // of node at index i
    int right(int i) { return (2 * i + 2); }
    // to get the root
    MinHeapNode getMin() { return harr[0]; }
    // to replace root with new node
    // x and heapify() new root
    void replaceMin(MinHeapNode x)
        harr[0] = x;
// Constructor: builds a heap in place from a given array a[] of the
// given size. Only the address of a[] is stored, so the array must
// outlive the heap.
MinHeap::MinHeap(MinHeapNode a[], int size)
{
    heap_size = size;
    harr = a; // store address of array
    // Sift down every node that can have children, last parent first,
    // so both subtrees are already heaps when a node is processed.
    for (int i = (heap_size - 1) / 2; i >= 0; i--)
        MinHeapify(i);
}
// A recursive method to heapify
// a subtree with root
// at given index. This method
// assumes that the
// subtrees are already heapified
void MinHeap::MinHeapify(int i)
    int l = left(i);
    int r = right(i);
    int smallest = i;
    if (l < heap_size && harr[l].element < harr[i].element)
        smallest = l;
    if (r < heap_size && harr[r].element < harr[smallest].element)
        smallest = r;
    if (smallest != i) {
        swap(&harr[i], &harr[smallest]);
// A utility function to swap the contents of the two heap nodes that
// x and y point to. Both pointers must be valid.
void swap(MinHeapNode* x, MinHeapNode* y)
{
    MinHeapNode temp = *x;
    *x = *y;
    *y = temp;
}
// Merges two adjacent sorted subarrays of arr[] into one sorted range.
// First subarray is arr[l..m]
// Second subarray is arr[m+1..r]
// Requires l <= m + 1 and m <= r. Equal elements keep their relative
// order (the element from the first subarray is taken first).
void merge(int arr[], int l, int m, int r)
{
    /* Copy both halves into temporaries. std::vector is used instead of
       variable-length arrays, which are not standard C++ and can
       overflow the stack for large runs. */
    std::vector<int> left_half(arr + l, arr + m + 1);
    std::vector<int> right_half(arr + m + 1, arr + r + 1);

    // Next unread position in each half
    std::size_t i = 0;
    std::size_t j = 0;
    // Next position to write in arr[l..r]
    int k = l;

    /* Merge the temporaries back into arr[l..r] */
    while (i < left_half.size() && j < right_half.size()) {
        if (left_half[i] <= right_half[j])
            arr[k++] = left_half[i++];
        else
            arr[k++] = right_half[j++];
    }
    /* Copy the remaining elements of the first half, if there are any */
    while (i < left_half.size())
        arr[k++] = left_half[i++];
    /* Copy the remaining elements of the second half, if there are any */
    while (j < right_half.size())
        arr[k++] = right_half[j++];
}
/* l is for left index and r is right index of the
   sub-array of arr to be sorted */
void mergeSort(int arr[], int l, int r)
    if (l < r) {
        // Same as (l+r)/2, but avoids overflow for
        // large l and h
        int m = l + (r - l) / 2;
        // Sort first and second halves
        mergeSort(arr, l, m);
        mergeSort(arr, m + 1, r);
        merge(arr, l, m, r);
// Opens fileName with the given fopen() mode and returns the stream.
// If the file cannot be opened, the system error is reported through
// perror and the program terminates, so the result is never null.
// The parameters are const-qualified so string literals can be passed
// directly; existing callers passing char* keep working.
FILE* openFile(const char* fileName, const char* mode)
{
    FILE* fp = fopen(fileName, mode);
    if (fp == nullptr) {
        // perror appends ": <reason>\n" itself, so the prefix has no newline
        perror("Error while opening the file");
        exit(EXIT_FAILURE);
    }
    return fp;
}
// Merges k sorted files. Names of files are assumed
// to be 1, 2, 3, ... k
void mergeFiles(char* output_file, int n, int k)
    FILE* in[k];
    for (int i = 0; i < k; i++) {
        char fileName[2];
        // convert i to string
        snprintf(fileName, sizeof(fileName),
                 "%d", i);
        // Open output files in read mode.
        in[i] = openFile(fileName, "r");
    FILE* out = openFile(output_file, "w");
    // Create a min heap with k heap
    // nodes. Every heap node
    // has first element of scratch
    // output file
    MinHeapNode* harr = new MinHeapNode[k];
    int i;
    for (i = 0; i < k; i++) {
        // break if no output file is empty and
        // index i will be no. of input files
        if (fscanf(in[i], "%d ", &harr[i].element) != 1)
        // Index of scratch output file
        harr[i].i = i;
    // Create the heap
    MinHeap hp(harr, i);
    int count = 0;
    // Now one by one get the
    // minimum element from min
    // heap and replace it with
    // next element.
    // run till all filled input
    // files reach EOF
    while (count != i) {
        // Get the minimum element
        // and store it in output file
        MinHeapNode root = hp.getMin();
        fprintf(out, "%d ", root.element);
        // Find the next element that
        // will replace current
        // root of heap. The next element
        // belongs to same
        // input file as the current min element.
        if (fscanf(in[root.i], "%d ",
            != 1) {
            root.element = INT_MAX;
        // Replace root with next
        // element of input file
    // close input and output files
    for (int i = 0; i < k; i++)
// Using a merge-sort algorithm,
// create the initial runs
// and divide them evenly among
// the output files
void createInitialRuns(
    char* input_file, int run_size,
    int num_ways)
    // For big input file
    FILE* in = openFile(input_file, "r");
    // output scratch files
    FILE* out[num_ways];
    char fileName[2];
    for (int i = 0; i < num_ways; i++) {
        // convert i to string
        snprintf(fileName, sizeof(fileName),
                 "%d", i);
        // Open output files in write mode.
        out[i] = openFile(fileName, "w");
    // allocate a dynamic array large enough
    // to accommodate runs of size run_size
    int* arr = (int*)malloc(
        run_size * sizeof(int));
    bool more_input = true;
    int next_output_file = 0;
    int i;
    while (more_input) {
        // write run_size elements
        // into arr from input file
        for (i = 0; i < run_size; i++) {
            if (fscanf(in, "%d ", &arr[i]) != 1) {
                more_input = false;
        // sort array using merge sort
        mergeSort(arr, 0, i - 1);
        // write the records to the
        // appropriate scratch output file
        // can't assume that the loop
        // runs to run_size
        // since the last run's length
        // may be less than run_size
        for (int j = 0; j < i; j++)
                    "%d ", arr[j]);
    // close input and output files
    for (int i = 0; i < num_ways; i++)
// For sorting data stored on disk
void externalSort(
    char* input_file, char* output_file,
    int num_ways, int run_size)
    // read the input file,
    // create the initial runs,
    // and assign the runs to
    // the scratch output files
                      run_size, num_ways);
    // Merge the runs using
    // the K-way merging
    mergeFiles(output_file, run_size, num_ways);
// Driver program to test above
int main()
    // No. of Partitions of input file.
    int num_ways = 10;
    // The size of each partition
    int run_size = 1000;
    char input_file[] = "input.txt";
    char output_file[] = "output.txt";
    FILE* in = openFile(input_file, "w");
    // generate input
    for (int i = 0; i < num_ways * run_size; i++)
        fprintf(in, "%d ", rand());
    externalSort(input_file, output_file, num_ways,
    return 0;


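  • 时间复杂度: O(n log n)。
    每个 run 最多包含 run_size 个元素,对单个 run 做归并排序需要 O(run_size log run_size);共有约 n / run_size 个 run,因此生成全部初始 run 的时间为 O(n log run_size)。随后用大小为 num_ways 的最小堆做多路归并,每输出一个元素需要 O(log num_ways),合计 O(n log num_ways)。由于 run_size × num_ways ≈ n,总体时间复杂度为 O(n log run_size + n log num_ways) = O(n log n)。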
  • 辅助空间: O(run_size)。

注意:此代码需要创建文件的权限,因此无法在在线编译器上运行。在本地计算机上运行时,它会生成包含 10000 个随机数的示例输入文件 “input.txt”,对这些数字排序后将结果写入 “output.txt”。此外还会在当前目录生成名为 0、1、2 … 的临时文件,用于保存各个已排序的 run。


