translit.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  /*
   * Transliterate UTF-8 Russian input into all possible Latin
   * spellings; see the hardcoded table below.
   * Data is read from stdin and the results are written to stdout.
   *
   * Licensed under GPL-3.
   * ® 2012 - 2015 Andrew Savchenko <bircoph gmail com>
   */
  /* getline() is POSIX.1-2008: request it explicitly so that strict
   * -std=c11 builds still get its declaration. Must precede all includes. */
  #define _POSIX_C_SOURCE 200809L

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #define ALPHA_CNT  33  /* number of rows: one per table letter */
  #define TRANSL_CNT 6   /* slots per row, including the NULL terminator */

  /*
   * Conversion table.
   *
   * Layout of one row:
   *   [0]   the UTF-8 source letter,
   *   [1]   its primary Latin spelling,
   *   [2..] optional alternative spellings,
   * followed by a NULL terminator.
   *
   * Slots [0] and [1] of every row must be defined and non-NULL.
   * An empty string "" is a valid spelling: the letter is simply dropped.
   */
  const char *const ct[ALPHA_CNT][TRANSL_CNT] = {
      { "а", "a",                      NULL },
      { "б", "b",                      NULL },
      { "в", "v",                      NULL },
      { "г", "g",  "h",                NULL },
      { "д", "d",                      NULL },
      { "е", "e",  "je",  "ye",        NULL },
      { "ё", "e",  "jo",  "yo",        NULL },
      { "ж", "j",  "z",   "zh",        NULL },
      { "з", "z",                      NULL },
      { "и", "i",                      NULL },
      { "й", "j",  "i",                NULL },
      { "к", "k",                      NULL },
      { "л", "l",                      NULL },
      { "м", "m",                      NULL },
      { "н", "n",                      NULL },
      { "о", "o",                      NULL },
      { "п", "p",                      NULL },
      { "р", "r",  "p",                NULL },
      { "с", "s",  "c",                NULL },
      { "т", "t",                      NULL },
      { "у", "u",  "y",                NULL },
      { "ф", "f",                      NULL },
      { "х", "h",  "x",                NULL },
      { "ц", "ts", "s",                NULL },
      { "ч", "ch", "4",                NULL },
      { "ш", "sh",                     NULL },
      { "щ", "sh", "sh'", "sh`",       NULL },
      { "ъ", "b",  "'b",  "`b",  "",   NULL },
      { "ы", "y",  "bl",  "b1",        NULL },
      { "ь", "b",  "",                 NULL },
      { "э", "e",  "3",                NULL },
      { "ю", "ju", "yu",  "u",         NULL },
      { "я", "ya", "ja",               NULL },
  };
  52. // recursion similar to fork + exec;
  53. // only forward scan of the input data
  54. void convline(char *in, char *out, size_t out_start, size_t out_len)
  55. {
  56. size_t tr_len; // length of translated sequence
  57. char *fork; // forked output data (for multiple matches)
  58. size_t fork_len;
  59. int match; // set this flag if we found something
  60. // scan input line
  61. while (*in != '\0' && *in != '\n' && *in != '\r')
  62. {
  63. match = 0;
  64. for (int i=0; i<ALPHA_CNT; i++)
  65. {
  66. // we have an utf-8 symbol match
  67. if (! strncmp(in, ct[i][0], 2))
  68. {
  69. // look for other possible translations first
  70. for (int j=2; ct[i][j] && j<TRANSL_CNT; j++)
  71. {
  72. // precreate fork buffer
  73. fork_len = out_len;
  74. tr_len = strlen(ct[i][j]);
  75. if (out_len - out_start <= tr_len)
  76. fork_len = fork_len*2;
  77. fork = malloc(sizeof(char)*fork_len);
  78. // copy base value
  79. memcpy(fork, out, out_start);
  80. // get value from conversion table
  81. memcpy(fork+out_start, ct[i][j], tr_len);
  82. // fork further expansion for an alternative translation
  83. convline(in+2, fork, out_start+tr_len, fork_len);
  84. }
  85. // do we need to grow out buffer?
  86. tr_len = strlen(ct[i][1]);
  87. if (out_len - out_start <= tr_len)
  88. {
  89. out_len = out_len*2;
  90. out = realloc(out, sizeof(char)*out_len);
  91. }
  92. // get value from conversion table
  93. memcpy(out+out_start, ct[i][1], tr_len);
  94. // go to next char
  95. out_start += tr_len;
  96. in+=2;
  97. match = 1;
  98. break;
  99. }
  100. }
  101. if (match)
  102. continue;
  103. // we can be here only if all alpha matches above failed
  104. // assume non-convertable single character and just copy it
  105. if (out_len - out_start <= 1)
  106. {
  107. out_len = out_len*2;
  108. out = realloc(out, sizeof(char)*out_len);
  109. }
  110. *(out+out_start++) = *(in++);
  111. }
  112. // processing finished, outputting result
  113. out[out_start] = '\0';
  114. puts(out);
  115. free(out);
  116. }
  /*
   * Read stdin line by line and hand each line to convline(), which prints
   * all of its transliterations to stdout.
   *
   * Returns EXIT_SUCCESS when all input was processed, EXIT_FAILURE when an
   * output buffer could not be allocated or reading stdin failed.
   */
  int main(void)
  {
      char *line = NULL;  /* input line buffer, managed by getline() */
      size_t len = 0;     /* allocated size of `line` */
      int status = EXIT_SUCCESS;

      while (getline(&line, &len, stdin) != -1) {
          /*
           * Start with a buffer as large as the input buffer; convline()
           * grows it on demand, takes ownership and frees it after printing.
           */
          char *out = malloc(len);
          if (out == NULL) {
              fputs("translit: out of memory\n", stderr);
              status = EXIT_FAILURE;
              break;
          }
          convline(line, out, 0, len);
      }

      /* getline() returns -1 both at end of input and on error. */
      if (status == EXIT_SUCCESS && ferror(stdin)) {
          fputs("translit: error reading stdin\n", stderr);
          status = EXIT_FAILURE;
      }

      free(line);
      return status;
  }