Andrew Savchenko преди 9 години
ревизия
f0a0226c49
променени са 2 файла, в които са добавени 146 реда и са изтрити 0 реда
  1. 1 0
      README.md
  2. 145 0
      translit.c

+ 1 - 0
README.md

@@ -0,0 +1 @@
+A simple Russian transliteration generator that outputs every possible Latin variant of the input.

+ 145 - 0
translit.c

@@ -0,0 +1,145 @@
+/* 
+ * Transliterate UTF-8 Russian input into all possible Latin
+ * transliterations, see the hardcoded table below.
+ * Data is taken from stdin and is returned to stdout.
+ *
+ * Licensed under GPL-3.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+# define ALPHA_CNT 33 // number of rows in ct: one per lowercase letter of the Russian alphabet
+# define TRANSL_CNT 6 // columns per row: the letter, up to 4 translations and the NULL terminator
+
+// conversion table: column 0 is the UTF-8 cyrillic letter, column 1 its primary latin form, columns 2+ are alternative forms
+// every row must be NULL-terminated; an empty string "" is a valid translation meaning "emit nothing"
+// the first two columns of every row must always be defined and not NULL
+const char *const ct[ALPHA_CNT][TRANSL_CNT]={
+    {"а", "a", NULL},
+    {"б", "b", NULL},
+    {"в", "v", NULL},
+    {"г", "g", "h", NULL},
+    {"д", "d", NULL},
+    {"е", "e", "je", "ye", NULL},
+    {"ё", "e", "jo", "yo", NULL},
+    {"ж", "j", "z", NULL},
+    {"з", "z", NULL},
+    {"и", "i", NULL},
+    {"й", "j", "i", NULL},
+    {"к", "k", NULL},
+    {"л", "l", NULL},
+    {"м", "m", NULL},
+    {"н", "n", NULL},
+    {"о", "o", NULL},
+    {"п", "p", NULL},
+    {"р", "r", "p", NULL},
+    {"с", "s", "c", NULL},
+    {"т", "t", NULL},
+    {"у", "u", "y", NULL},
+    {"ф", "f", NULL},
+    {"х", "h", "x", NULL},
+    {"ц", "ts", "s", NULL},
+    {"ч", "ch", "4", NULL},
+    {"ш", "sh", NULL},
+    {"щ", "sh", "sh'", "sh`", NULL},
+    {"ъ", "b", "'b", "`b", "", NULL}, // longest row: uses all TRANSL_CNT slots
+    {"ы", "y", "bl", "b1", NULL},
+    {"ь", "b", "", NULL},
+    {"э", "e", "3", NULL},
+    {"ю", "ju", "yu", "u", NULL},
+    {"я", "ya", "ja", NULL},
+};
+
+// make sure buf can hold at least `need` bytes, growing *len by doubling;
+// a NULL buf is always allocated (with at least *len bytes);
+// terminates the program if memory cannot be obtained
+static char *buf_reserve(char *buf, size_t *len, size_t need)
+{
+    size_t new_len = *len;
+    char  *tmp;
+
+    if (buf != NULL && need <= new_len)
+        return buf;
+
+    if (new_len == 0)
+        new_len = 1;
+    while (new_len < need)
+    {
+        // doubling would overflow size_t: fall back to the exact size
+        if (new_len > (size_t)-1 / 2)
+        {
+            new_len = need;
+            break;
+        }
+        new_len *= 2;
+    }
+
+    // never assign realloc() straight to buf: buf stays valid on failure
+    tmp = realloc(buf, new_len);
+    if (tmp == NULL)
+    {
+        perror("realloc");
+        free(buf);
+        exit(EXIT_FAILURE);
+    }
+    *len = new_len;
+    return tmp;
+}
+
+// Transliterate one input line and print every variant to stdout.
+//
+// in        - input text; scanning stops at '\0', '\n' or '\r'; not modified
+// out       - heap buffer holding the already translated prefix; this
+//             function takes ownership: it may realloc it and always frees it
+// out_start - number of valid bytes already stored in out
+// out_len   - allocated size of out in bytes
+//
+// Works like fork + exec: for every letter with alternative translations
+// a copy of the output is made for each alternative and the rest of the
+// line is processed recursively on that copy, then this call goes on with
+// the primary translation. Alternatives are therefore printed before the
+// primary variant. Input is scanned forward only; every table letter is
+// assumed to occupy exactly 2 bytes. Bytes that match no table letter are
+// copied unchanged.
+void convline(char *in, char *out, size_t out_start, size_t out_len)
+{
+    size_t tr_len;          // length of translated sequence
+    char  *alt;             // forked output data (for multiple matches)
+    size_t alt_len;
+    const char *const *row; // table row matching the current letter
+
+    // tolerate a failed allocation in the caller and guarantee
+    // out_start < out_len, so the final terminator always fits
+    if (out == NULL)
+        out_start = 0;
+    out = buf_reserve(out, &out_len, out_start + 1);
+
+    // scan input line
+    while (*in != '\0' && *in != '\n' && *in != '\r')
+    {
+        // look up the utf-8 symbol at the current position
+        row = NULL;
+        for (int i=0; i<ALPHA_CNT; i++)
+        {
+            if (! strncmp(in, ct[i][0], 2))
+            {
+                row = ct[i];
+                break;
+            }
+        }
+
+        // no match: assume non-convertable single byte and just copy it
+        // (+2: the byte itself and room for the terminator)
+        if (row == NULL)
+        {
+            out = buf_reserve(out, &out_len, out_start + 2);
+            out[out_start++] = *(in++);
+            continue;
+        }
+
+        // handle other possible translations first;
+        // the bound is tested before the table entry is read
+        for (int j=2; j<TRANSL_CNT && row[j]; j++)
+        {
+            tr_len = strlen(row[j]);
+
+            // precreate fork buffer, at least as large as out
+            alt_len = out_len;
+            alt = buf_reserve(NULL, &alt_len, out_start + tr_len + 1);
+
+            // copy base value; out is not NUL-terminated at this point,
+            // so copy exactly the valid prefix instead of using strcpy
+            memcpy(alt, out, out_start);
+            // get value from conversion table
+            memcpy(alt + out_start, row[j], tr_len);
+            // fork further expansion for an alternative translation;
+            // the callee prints and frees alt
+            convline(in+2, alt, out_start+tr_len, alt_len);
+        }
+
+        // primary translation goes into our own buffer
+        tr_len = strlen(row[1]);
+        out = buf_reserve(out, &out_len, out_start + tr_len + 1);
+        memcpy(out + out_start, row[1], tr_len);
+
+        // go to next char
+        out_start += tr_len;
+        in += 2;
+    }
+
+    // processing finished, outputting result
+    out[out_start] = '\0';
+    puts(out);
+    free(out);
+}
+
+// Read stdin line by line and print all transliterations of each line.
+// Returns 0 on success, EXIT_FAILURE if memory allocation or reading fails.
+int main(void)
+{
+    char *line = NULL;  // input line buffer, managed by getline()
+    size_t len = 0;     // allocated size of the line buffer
+    char *out = NULL;   // output string
+    int status = 0;
+
+    while (getline(&line, &len, stdin) != -1)
+    {
+        // precreate buffer with a sane value (size of the input buffer);
+        // ownership passes to convline(), which frees it after puts()
+        out = malloc(sizeof(char)*len);
+        if (out == NULL)
+        {
+            perror("malloc");
+            status = EXIT_FAILURE;
+            break;
+        }
+        // recursive line conversion
+        convline(line, out, 0, len);
+    }
+
+    // getline() returns -1 both on EOF and on error: tell them apart
+    if (ferror(stdin))
+    {
+        perror("getline");
+        status = EXIT_FAILURE;
+    }
+
+    // the getline() buffer must be freed by the caller
+    free(line);
+    return status;
+}