translit.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
/*
 * Transliterate UTF-8 Russian input into all possible Latin
 * transliterations; see the hardcoded table below.
 * Data is read from stdin and the results are written to stdout.
 *
 * Licensed under GPL-3.
 */
  8. #include <stdio.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11. # define ALPHA_CNT 33
  12. # define TRANSL_CNT 6
  13. // conversion table
  14. // all lines must be NULL-terminated
  15. // first two rows must always be defined and not NULL
  16. const char *const ct[ALPHA_CNT][TRANSL_CNT]={
  17. {"а", "a", NULL},
  18. {"б", "b", NULL},
  19. {"в", "v", NULL},
  20. {"г", "g", "h", NULL},
  21. {"д", "d", NULL},
  22. {"е", "e", "je", "ye", NULL},
  23. {"ё", "e", "jo", "yo", NULL},
  24. {"ж", "j", "z", NULL},
  25. {"з", "z", NULL},
  26. {"и", "i", NULL},
  27. {"й", "j", "i", NULL},
  28. {"к", "k", NULL},
  29. {"л", "l", NULL},
  30. {"м", "m", NULL},
  31. {"н", "n", NULL},
  32. {"о", "o", NULL},
  33. {"п", "p", NULL},
  34. {"р", "r", "p", NULL},
  35. {"с", "s", "c", NULL},
  36. {"т", "t", NULL},
  37. {"у", "u", "y", NULL},
  38. {"ф", "f", NULL},
  39. {"х", "h", "x", NULL},
  40. {"ц", "ts", "s", NULL},
  41. {"ч", "ch", "4", NULL},
  42. {"ш", "sh", NULL},
  43. {"щ", "sh", "sh'", "sh`", NULL},
  44. {"ъ", "b", "'b", "`b", "", NULL},
  45. {"ы", "y", "bl", "b1", NULL},
  46. {"ь", "b", "", NULL},
  47. {"э", "e", "3", NULL},
  48. {"ю", "ju", "yu", "u", NULL},
  49. {"я", "ya", "ja", NULL},
  50. };
  51. // recursion similar to fork + exec;
  52. // only forward scan of the input data
  53. void convline(char *in, char *out, size_t out_start, size_t out_len)
  54. {
  55. size_t tr_len; // length of translated sequence
  56. char *fork; // forked output data (for multiple matches)
  57. size_t fork_len;
  58. int match; // set this flag if we found something
  59. // scan input line
  60. while (*in != '\0' && *in != '\n' && *in != '\r')
  61. {
  62. match = 0;
  63. for (int i=0; i<ALPHA_CNT; i++)
  64. {
  65. // we have an utf-8 symbol match
  66. if (! strncmp(in, ct[i][0], 2))
  67. {
  68. // look for other possible translations first
  69. for (int j=2; ct[i][j] && j<TRANSL_CNT; j++)
  70. {
  71. // precreate fork buffer
  72. fork_len = out_len;
  73. tr_len = strlen(ct[i][j]);
  74. if (out_len - out_start <= tr_len)
  75. fork_len = fork_len*2;
  76. fork = malloc(sizeof(char)*fork_len);
  77. // copy base value
  78. strcpy(fork, out);
  79. // get value from conversion table
  80. strcpy(fork+out_start, ct[i][j]);
  81. // fork further expansion for an alternative translation
  82. convline(in+2, fork, out_start+tr_len, fork_len);
  83. }
  84. // do we need to grow out buffer?
  85. tr_len = strlen(ct[i][1]);
  86. if (out_len - out_start <= tr_len)
  87. {
  88. out_len = out_len*2;
  89. out = realloc(out, sizeof(char)*out_len);
  90. }
  91. // get value from conversion table
  92. strcpy(out+out_start, ct[i][1]);
  93. // go to next char
  94. out_start += tr_len;
  95. in+=2;
  96. match = 1;
  97. break;
  98. }
  99. }
  100. if (match)
  101. continue;
  102. // we can be here only if all alpha matches above failed
  103. // assume non-convertable single character and just copy it
  104. if (out_len - out_start <= 1)
  105. {
  106. out_len = out_len*2;
  107. out = realloc(out, sizeof(char)*out_len);
  108. }
  109. *(out+out_start++) = *(in++);
  110. }
  111. // processing finished, outputting result
  112. out[out_start] = '\0';
  113. puts(out);
  114. free(out);
  115. }
  116. int main()
  117. {
  118. char *line = NULL; // input line buffer
  119. size_t len = 0; //
  120. char *out = NULL; // output string
  121. while (getline(&line, &len, stdin) != -1)
  122. {
  123. // precreate buffer with a sane value
  124. // must be freed after puts()
  125. out = malloc(sizeof(char)*len);
  126. // recursive line conversion
  127. convline(line, out, 0, len);
  128. }
  129. return 0;
  130. }