123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- /*
- * Transliterate UTF-8 Russian input into all possible Latin
- * spellings; see the hardcoded table below.
- * Data is taken from stdin and is returned to stdout.
- *
- * Licensed under GPL-3.
- * ® 2012 - 2015 Andrew Savchenko <bircoph gmail com>
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
#define ALPHA_CNT  33
#define TRANSL_CNT 6

// Transliteration table, one row per Russian letter:
//   column 0     - the letter as it appears in the input
//   column 1     - its primary Latin spelling
//   columns 2... - alternative Latin spellings, if any
// Every row ends with NULL, which marks the end of the spellings;
// columns 0 and 1 must be present (non-NULL) in every row.
const char *const ct[ALPHA_CNT][TRANSL_CNT] = {
    { "а", "a",                       NULL },
    { "б", "b",                       NULL },
    { "в", "v",                       NULL },
    { "г", "g",  "h",                 NULL },
    { "д", "d",                       NULL },
    { "е", "e",  "je",  "ye",         NULL },
    { "ё", "e",  "jo",  "yo",         NULL },
    { "ж", "j",  "z",   "zh",         NULL },
    { "з", "z",                       NULL },
    { "и", "i",                       NULL },
    { "й", "j",  "i",                 NULL },
    { "к", "k",                       NULL },
    { "л", "l",                       NULL },
    { "м", "m",                       NULL },
    { "н", "n",                       NULL },
    { "о", "o",                       NULL },
    { "п", "p",                       NULL },
    { "р", "r",  "p",                 NULL },
    { "с", "s",  "c",                 NULL },
    { "т", "t",                       NULL },
    { "у", "u",  "y",                 NULL },
    { "ф", "f",                       NULL },
    { "х", "h",  "x",                 NULL },
    { "ц", "ts", "s",                 NULL },
    { "ч", "ch", "4",                 NULL },
    { "ш", "sh",                      NULL },
    { "щ", "sh", "sh'", "sh`",        NULL },
    { "ъ", "b",  "'b",  "`b",  "",    NULL },
    { "ы", "y",  "bl",  "b1",         NULL },
    { "ь", "b",  "",                  NULL },
    { "э", "e",  "3",                 NULL },
    { "ю", "ju", "yu",  "u",          NULL },
    { "я", "ya", "ja",                NULL },
};
- // recursion similar to fork + exec;
- // only forward scan of the input data
- void convline(char *in, char *out, size_t out_start, size_t out_len)
- {
- size_t tr_len; // length of translated sequence
- char *fork; // forked output data (for multiple matches)
- size_t fork_len;
- int match; // set this flag if we found something
- // scan input line
- while (*in != '\0' && *in != '\n' && *in != '\r')
- {
- match = 0;
- for (int i=0; i<ALPHA_CNT; i++)
- {
- // we have an utf-8 symbol match
- if (! strncmp(in, ct[i][0], 2))
- {
- // look for other possible translations first
- for (int j=2; ct[i][j] && j<TRANSL_CNT; j++)
- {
- // precreate fork buffer
- fork_len = out_len;
- tr_len = strlen(ct[i][j]);
- if (out_len - out_start <= tr_len)
- fork_len = fork_len*2;
-
- fork = malloc(sizeof(char)*fork_len);
-
- // copy base value
- memcpy(fork, out, out_start);
- // get value from conversion table
- memcpy(fork+out_start, ct[i][j], tr_len);
- // fork further expansion for an alternative translation
- convline(in+2, fork, out_start+tr_len, fork_len);
- }
- // do we need to grow out buffer?
- tr_len = strlen(ct[i][1]);
- if (out_len - out_start <= tr_len)
- {
- out_len = out_len*2;
- out = realloc(out, sizeof(char)*out_len);
- }
-
- // get value from conversion table
- memcpy(out+out_start, ct[i][1], tr_len);
- // go to next char
- out_start += tr_len;
- in+=2;
- match = 1;
- break;
- }
- }
- if (match)
- continue;
- // we can be here only if all alpha matches above failed
- // assume non-convertable single character and just copy it
- if (out_len - out_start <= 1)
- {
- out_len = out_len*2;
- out = realloc(out, sizeof(char)*out_len);
- }
- *(out+out_start++) = *(in++);
- }
-
- // processing finished, outputting result
- out[out_start] = '\0';
- puts(out);
- free(out);
- }
// Read stdin line by line and print every transliteration of each line.
// Returns EXIT_SUCCESS at end of input, EXIT_FAILURE if a buffer cannot
// be allocated or reading stdin fails.
int main(void)
{
    char *line = NULL;          // input line buffer, managed by getline()
    size_t len = 0;             // allocated size of line
    int status = EXIT_SUCCESS;

    while (getline(&line, &len, stdin) != -1)
    {
        // precreate the output buffer with a sane size;
        // convline() takes ownership and frees it after printing
        char *out = malloc(len);
        if (out == NULL)
        {
            perror("malloc");
            status = EXIT_FAILURE;
            break;
        }
        // recursive line conversion
        convline(line, out, 0, len);
    }

    // getline() returns -1 both at EOF and on error; tell them apart
    if (status == EXIT_SUCCESS && ferror(stdin))
    {
        perror("getline");
        status = EXIT_FAILURE;
    }

    free(line);
    return status;
}
|