sortfile.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. /*-
  2. * Copyright (c) 2010
  3. * Thorsten Glaser <tg@mirbsd.org>
  4. *
  5. * Provided that these terms and disclaimer and all copyright notices
  6. * are retained or reproduced in an accompanying document, permission
  7. * is granted to deal in this work without restriction, including un-
  8. * limited rights to use, publicly perform, distribute, sell, modify,
  9. * merge, give away, or sublicence.
  10. *
  11. * This work is provided "AS IS" and WITHOUT WARRANTY of any kind, to
  12. * the utmost extent permitted by applicable law, neither express nor
  13. * implied; without malicious intent or gross negligence. In no event
  14. * may a licensor, author or contributor be held liable for indirect,
  15. * direct, other damage, loss, or other issues arising in any way out
  16. * of dealing in the work, even if advised of the possibility of such
  17. * damage or existence of a defect, except proven that it results out
  18. * of said person's immediate fault when using the work as intended.
  19. */
  20. #include <sys/types.h>
  21. #include <sys/mman.h>
  22. #include <sys/stat.h>
  23. #include <err.h>
  24. #include <fcntl.h>
  25. #include <limits.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #include <unistd.h>
  29. struct ptrsize {
  30. const char *ptr;
  31. size_t size;
  32. };
  33. static void *xrecalloc(void *, size_t, size_t);
  34. static int cmpfn(const void *, const void *);
  35. #define MUL_NO_OVERFLOW (1UL << (sizeof (size_t) * 8 / 2))
  36. #ifndef SIZE_MAX
  37. #ifdef SIZE_T_MAX
  38. #define SIZE_MAX SIZE_T_MAX
  39. #else
  40. #define SIZE_MAX ((size_t)-1)
  41. #endif
  42. #endif
  43. #if !defined(MAP_FAILED)
  44. /* XXX imake style */
  45. # if defined(__linux)
  46. #define MAP_FAILED ((void *)-1)
  47. # elif defined(__bsdi__) || defined(__osf__) || defined(__ultrix)
  48. #define MAP_FAILED ((caddr_t)-1)
  49. # endif
  50. #endif
  51. static void *
  52. xrecalloc(void *ptr, size_t nmemb, size_t size)
  53. {
  54. void *rv;
  55. if ((nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) &&
  56. nmemb > 0 && SIZE_MAX / nmemb < size)
  57. errx(1, "attempted integer overflow: %zu * %zu", nmemb, size);
  58. size *= nmemb;
  59. if ((rv = realloc(ptr, size)) == NULL)
  60. err(1, "cannot allocate %zu bytes", size);
  61. return (rv);
  62. }
  63. int
  64. sortfile(char *infile, char *outfile)
  65. {
  66. int fd, fdout;
  67. size_t fsz, asz, anents;
  68. char *cp, *thefile, *endfile;
  69. struct ptrsize *thearray;
  70. if ((fd = open(infile, O_RDONLY)) < 0)
  71. err(1, "open: %s", infile);
  72. else {
  73. struct stat sb;
  74. /* reasonable maximum size: 3/4 of SIZE_MAX */
  75. fsz = (SIZE_MAX / 2) + (SIZE_MAX / 4);
  76. if (fstat(fd, &sb))
  77. err(1, "stat: %s", infile);
  78. if (sb.st_size > fsz)
  79. errx(1, "file %s too big, %llu > %zu", infile,
  80. (unsigned long long)sb.st_size, fsz);
  81. fsz = (size_t)sb.st_size;
  82. }
  83. if ((thefile = mmap(NULL, fsz, PROT_READ, MAP_FILE | MAP_PRIVATE,
  84. fd, (off_t)0)) == MAP_FAILED)
  85. err(1, "mmap %zu bytes from %s", fsz, infile);
  86. /* last valid byte in the file, must be newline anyway */
  87. endfile = thefile + fsz - 1;
  88. thearray = xrecalloc(NULL, (asz = 8), sizeof(thearray[0]));
  89. thearray[(anents = 0)].ptr = cp = thefile;
  90. while ((cp = memchr(cp, '\n', endfile - cp)) != NULL) {
  91. /* byte after the \n */
  92. if (++cp > endfile)
  93. /* end of file */
  94. break;
  95. thearray[anents].size = cp - thearray[anents].ptr;
  96. if (++anents == asz)
  97. /* resize array */
  98. thearray = xrecalloc(thearray, (asz <<= 1),
  99. sizeof(thearray[0]));
  100. thearray[anents].ptr = cp;
  101. }
  102. thearray[anents].size = endfile - thearray[anents].ptr + 1;
  103. qsort(thearray, ++anents, sizeof(thearray[0]), cmpfn);
  104. if ((fdout = open(outfile, O_WRONLY | O_CREAT, S_IRWXU)) < 0)
  105. err(1, "open: %s", outfile);
  106. else {
  107. for (asz = 0; asz < anents; ++asz)
  108. if ((size_t)write(fdout, thearray[asz].ptr,
  109. thearray[asz].size) != thearray[asz].size)
  110. err(1, "write %zu bytes", thearray[asz].size);
  111. }
  112. if (munmap(thefile, fsz))
  113. warn("munmap");
  114. free(thearray);
  115. close(fd);
  116. return (0);
  117. }
  118. static int
  119. cmpfn(const void *p1, const void *p2)
  120. {
  121. int rv;
  122. const struct ptrsize *a1 = (const struct ptrsize *)p1;
  123. const struct ptrsize *a2 = (const struct ptrsize *)p2;
  124. if ((rv = memcmp(a1->ptr, a2->ptr, (a1->size > a2->size ?
  125. a2->size : a1->size) - /* '\n' */ 1)) != 0)
  126. /* unequal in the common part */
  127. return (rv);
  128. /* shorter string is smaller */
  129. return (a1->size > a2->size ? 1 : a1->size == a2->size ? 0 : -1);
  130. }