initial import

(automatically generated log message)
author: Suren A. Chilingaryan <csa@dside.dyndns.org> 2005-06-16 23:19:27 +0000
committer: Suren A. Chilingaryan <csa@dside.dyndns.org> 2005-06-16 23:19:27 +0000
commit: 70fbe7822024d0acc68df3607ff25bf8d7a71751 (patch)
tree: 553cd2ef8cfc936fc890113596db2c4478fe5163 /statgen
download: librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.gz
librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.bz2
librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.xz
librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.zip
8 files changed, 525 insertions, 0 deletions
diff --git a/statgen/Makefile b/statgen/Makefile
new file mode 100644
index 0000000..76251bd
--- /dev/null
+++ b/statgen/Makefile
@@ -0,0 +1,10 @@
+all: generate ascii test upper debug 
+generate: generate.c	# -lm follows the source so the linker still needs libm when it reaches it (log10)
+	gcc -o generate generate.c -lm
+ascii: ascii.c
+	gcc -o ascii ascii.c
+test: test.c charset_auto_russian.h russian_table.h
+	gcc -o test test.c
+debug: debug.c charset_auto_russian.h russian_table.h
+	gcc -o debug debug.c
+upper: upper.c
+\ No newline at end of file
diff --git a/statgen/ascii.c b/statgen/ascii.c
new file mode 100644
index 0000000..73dd6e4
--- /dev/null
+++ b/statgen/ascii.c
@@ -0,0 +1,11 @@
+#include <stdio.h>
+
+int main(void) {	/* Print codes 32..255 as "decimal hex character", eight entries per row. */
+    int i;
+    for (i=32;i<256;i++) {
+	if ((i%8)==0) printf("\n");	/* 32 is a multiple of 8, so the table opens with a newline */
+	printf("%3.u %2.x %c   ",i,i,i);
+    }
+    printf("\n\n");
+    return 0;
+}
diff --git a/statgen/debug.c b/statgen/debug.c
new file mode 100644
index 0000000..85b950b
--- /dev/null
+++ b/statgen/debug.c
@@ -0,0 +1,33 @@
+#include <stdio.h>
+#define _AUTO_DEBUG
+#include "charset_auto_russian.h"
+
+
+/* Pass every word of the given file whose trimmed end index is at least 5 to autocharset_russian_uc(). */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    int len,st;
+    char word[256];
+
+    if (argc!=2) {
+	printf("Usage: %s <file name>\n",argv[0]);
+	exit(0);
+    }
+
+    f=fopen(argv[1],"r");
+    if (!f) {
+	printf("Failed to open specified file. Check permissions!\n");
+	exit(1);
+    }
+    /* Loop on the fscanf() result: feof() only becomes true after a failed read, so testing it first reused the last word. */
+    while (fscanf(f,"%255s",word)==1) {	/* the field width keeps the token inside word[256] */
+	for(st=0;word[st]=='"'||word[st]=='\''||word[st]=='(';st++);	/* skip opening quotes and parentheses */
+	/* len>=0 stops the backwards scan from reading word[-1] when the token is nothing but punctuation. */
+	for(len=strlen(word)-1;len>=0&&(word[len]==','||word[len]=='.'||word[len]=='!'||word[len]=='?'||word[len]==';'||word[len]=='-'||word[len]==':'||word[len]=='"'||word[len]=='\''||word[len]==')');len--);
+	if (len<5) continue;
+	word[len+1]=0;
+	autocharset_russian_uc(word+st,len+1-st);
+    }
+    fclose(f);
+    return 0;
+}
diff --git a/statgen/doit b/statgen/doit
new file mode 100755
index 0000000..c38fe39
--- /dev/null
+++ b/statgen/doit
@@ -0,0 +1,42 @@
+#! /bin/bash
+
+if [ -z "$1" ]; then
+    echo "Usage: doit <file name>"
+    exit
+fi
+
+# Some CP866 texts use the "Yo" and "N" symbols from the CP1251 encoding; the sed expressions below remap them.
+dos2unix -U "$1"
+
+sed -e "s/¸/ñ/g;s/¹/N/g;s/°/ø/g" "$1" | iconv -f CP866 -t KOI8-R > "$1.koi"
+sed -e "s/¸/ñ/g;s/¹/N/g;s/°/ø/g" "$1" | iconv -f CP866 -t CP1251 > "$1.win"
+sed -e "s/¸/ñ/g;s/¹/N/g;s/°/ø/g" "$1" > "$1.alt"
+
+#cat $1 | sed -e "s/¸/ñ/g;s/¹/ü/g;s/°/ø/g" | iconv -f CP866 -t UTF-8 > $1.utf
+
+export LC_CTYPE="ru_RU.KOI8-R"
+./generate "$1.koi" koi > russian_table.h  2> header1.tmp
+./generate "$1.koi" win >> russian_table.h  2> header2.tmp
+./generate "$1.koi" alt >> russian_table.h  2> header3.tmp
+#./generate $1.win win >> russian_table.h 2> header2.tmp
+#./generate $1 alt >> russian_table.h 2> header3.tmp
+
+cmp header1.tmp header2.tmp
+if [ $? -ne 0 ]; then
+    echo "Different number items in win & koi tables. Strange..."
+    rm -f russian_table.h
+else
+    cmp header1.tmp header3.tmp
+    if [ $? -ne 0 ]; then
+	echo "Different number items in alt & koi tables. Strange..."
+	rm -f russian_table.h
+    else
+	cat header1.tmp >> russian_table.h
+    fi
+fi
+
+rm -f header?.tmp
+rm -f "$1.koi"
+#rm -f $1.win
+#rm -f $1.alt
+#rm -f $1.utf
diff --git a/statgen/generate.c b/statgen/generate.c
new file mode 100644
index 0000000..838c0a7
--- /dev/null
+++ b/statgen/generate.c
@@ -0,0 +1,258 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <locale.h>
+#include <math.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#define first_char 128	/* lowest byte value that has a row/column in the pair table */
+#define last_char 255	/* highest byte value that has a row/column in the pair table */
+
+#define original_first_char 192	/* get_array_pos() rejects input bytes below this value */
+#define original_last_char 255	/* get_array_pos() rejects input bytes above this value */
+
+#define chars_number (last_char-first_char+1)	/* number of byte values per table dimension */
+#define array_size (chars_number*chars_number)	/* total entries in the chars_number x chars_number pair table */
+
+struct array_pos {	/* pair-table indexes for the case variants of one byte pair; filled by get_array_pos() */
+    int ll;	/* lower-case first byte, lower-case second byte */
+    int uu;	/* upper-case both; -1 unless each byte has distinct lower and upper forms */
+    int lu;	/* lower first, upper second; -1 when the second byte's two forms coincide */
+    int ul;	/* upper first, lower second; -1 when the first byte's two forms coincide */
+};
+
+struct pstat {	/* occurrence counters for one byte pair; incremented by analyze() */
+    unsigned long p;	/* pair counted as neither a word start nor a word end */
+    unsigned long s;	/* pair at the start of the text or right after a start_symbol() byte */
+    unsigned long e;	/* pair followed by an end_symbol() byte (and not already counted as a start) */
+};
+
+iconv_t icnv=(iconv_t)-1;	/* descriptor used by convert_char(); while it is (iconv_t)-1 bytes are returned unconverted */
+
+int end_symbol(char ch) {	/* 1 when ch can follow the last letter of a word (NUL, whitespace, closing punctuation), else 0 */
+    switch (ch) { case '\r': case '\n': case 0: case ' ': case '\t': case ',': case '.': case '!': case '?': case ';': case '-': case ':': case '"': case '\'': case ')': return 1; }
+    return 0;
+}
+
+int start_symbol(char ch) {	/* 1 when ch can precede the first letter of a word (whitespace, opening quote or parenthesis), else 0 */
+    switch (ch) { case '\t': case '\r': case '\n': case ' ': case '(': case '"': case '\'': return 1; }
+    return 0;
+}
+
+
+/* Convert one byte through the global icnv descriptor; returns c unchanged while icnv is unset. Exits on a conversion error. */
+unsigned char convert_char(unsigned char c) {
+    char in=(char)c, out=0;
+    char *pin=&in, *pout=&out;
+    size_t inleft=1,outleft=1;
+    if (icnv == (iconv_t)-1) return c;
+    /* iconv() returns size_t, so failure is (size_t)-1; a "<0" comparison on it can never be true. */
+    if (iconv(icnv,&pin,&inleft,&pout,&outleft)==(size_t)-1) {
+	printf("Error converting characters!\n");
+	exit(1);
+    }
+    return (unsigned char)out;
+}
+
+/*
+ * Compute the pair-table indexes for the case variants of the byte pair (a,b).
+ * Returns -1 when either byte is outside original_first_char..original_last_char;
+ * otherwise fills *pos and returns 0.  A variant that does not differ from the
+ * all-lower-case pair is stored as -1 (ul and uu need a first byte with two
+ * distinct forms, lu and uu a second byte with two distinct forms).
+ */
+int get_array_pos(struct array_pos *pos, int a, int b) {
+    int lo1,up1,lo2,up2;
+    int cased1,cased2;
+
+    if (a<original_first_char||a>original_last_char||b<original_first_char||b>original_last_char) return -1;
+
+    lo1=tolower(a);
+    up1=toupper(a);
+    lo2=tolower(b);
+    up2=toupper(b);
+
+    /* A case mapping that leaves the accepted range is replaced by the input byte itself. */
+    if (!(lo1>=original_first_char&&lo1<=original_last_char)) lo1=a;
+    if (!(lo2>=original_first_char&&lo2<=original_last_char)) lo2=b;
+    if (!(up1>=original_first_char&&up1<=original_last_char)) up1=a;
+    if (!(up2>=original_first_char&&up2<=original_last_char)) up2=b;
+
+    /* Pass each variant through convert_char(). */
+    lo1=convert_char(lo1);
+    up1=convert_char(up1);
+    lo2=convert_char(lo2);
+    up2=convert_char(up2);
+
+    cased1=(lo1!=up1);
+    cased2=(lo2!=up2);
+
+    /* index = (first byte - first_char) * chars_number + (second byte - first_char) */
+    pos->ll=(lo1-first_char)*chars_number+(lo2-first_char);
+    pos->ul=cased1?(up1-first_char)*chars_number+(lo2-first_char):-1;
+    pos->lu=cased2?(lo1-first_char)*chars_number+(up2-first_char):-1;
+    pos->uu=(cased2&&cased1)?(up1-first_char)*chars_number+(up2-first_char):-1;
+
+    return 0;
+}
+
+
+/*
+ * Count, for every byte pair of text[0..length), how often it starts a word (s),
+ * ends a word (e) or occurs elsewhere (p), for the ll, ul and uu case variants.
+ * Returns a malloc'ed table of array_size entries (the caller frees it) or NULL.
+ */
+struct pstat *analyze(const unsigned char *text, unsigned long length) {
+    struct pstat *a;
+    unsigned long i;
+    struct array_pos pos;
+    int idx[3],k,where;
+
+    a=(struct pstat*)malloc(array_size*sizeof(struct pstat));
+    if (!a) return NULL;
+
+    for (i=0;i<array_size;i++) {
+	a[i].p=0;
+	a[i].s=0;
+	a[i].e=0;
+    }
+
+    for (i=1;i<length;i++) {
+	if (get_array_pos(&pos,text[i-1],text[i])<0) continue;
+	/* Classify the pair once: 0 = starts a word, 1 = ends a word, 2 = anywhere else. */
+	if ((i==1)||(start_symbol(text[i-2]))) where=0;
+	/* i+1==length means the pair closes the buffer; testing i+2==length instead
+	 * let the final iteration read text[length], one byte past the end. */
+	else if ((i+1==length)||(end_symbol(text[i+1]))) where=1;
+	else where=2;
+
+	/* The lower/upper index (pos.lu) is computed by get_array_pos() but is not counted here. */
+	idx[0]=pos.ll; idx[1]=pos.ul; idx[2]=pos.uu;
+	for (k=0;k<3;k++) {
+	    if (idx[k]<0) continue;
+	    if (where==0) a[idx[k]].s++;
+	    else if (where==1) a[idx[k]].e++;
+	    else a[idx[k]].p++;
+	}
+    }
+    return a;
+}
+
+
+int print(struct pstat *a) {	/* Write one "{'x','y',p,s,e}" initializer per non-zero table entry; returns how many were written. */
+    int idx,emitted=0;
+    for (idx=0;idx<array_size;idx++) {
+	const struct pstat *cur=&a[idx];
+	if (!cur->p&&!cur->s&&!cur->e) continue;
+	double lp=cur->p?log10(cur->p):-2;	/* -2 is printed in place of the logarithm of a zero count */
+	double ls=cur->s?log10(cur->s):-2;
+	double le=cur->e?log10(cur->e):-2;
+	if (emitted) printf("%s",(emitted%8==0)?",\n":", ");	/* eight initializers per output line */
+	printf("{'%c','%c',%lf,%lf,%lf}",first_char+idx/chars_number,first_char+idx%chars_number,lp,ls,le);
+	emitted++;
+    }
+    if ((emitted%8)!=1) printf("\n");
+    return emitted;
+}
+
+
+/* Return the smallest power of two that is strictly greater than n (never less than 2). */
+unsigned long npow(unsigned long n) {
+    unsigned long power;
+    for (power=2;power<=n;power<<=1) ;
+    return power; }
+
+/* Build the byte-pair table for <file name> and print it to stdout as the C initializer enc_<encoding>;
+ * the entry count and npow() of it are written to stderr.  Exit status: 0 on success or usage, 1 on failure. */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    struct stat st;
+    unsigned char *text;
+    struct pstat *a;
+    int num,rc=1;
+    char locale[32];
+
+    if (argc!=3) {
+	printf("Usage: %s <file name> <encoding>\n",argv[0]);
+	exit(0);
+    }
+
+    if (strlen(argv[2])>12) {	/* also guarantees the name fits into locale[32] below */
+	printf("Invalid encoding(%s) specified!\n",argv[2]);
+	exit(1);
+    }
+    
+    if ((!strcasecmp(argv[2],"koi"))||(!strcasecmp(argv[2],"koi8"))||(!strcasecmp(argv[2],"koi-8"))||(!strcasecmp(argv[2],"koi8-r")))
+    	sprintf(locale,"%s","KOI8-R");
+    else if ((!strcasecmp(argv[2],"win"))||(!strcasecmp(argv[2],"cp1251"))||(!strcasecmp(argv[2],"cp-1251"))||(!strcasecmp(argv[2],"win1251"))||(!strcasecmp(argv[2],"win-1251")))
+	sprintf(locale,"%s","CP1251");
+    else if ((!strcasecmp(argv[2],"alt"))||(!strcasecmp(argv[2],"cp866"))||(!strcasecmp(argv[2],"cp-866"))||(!strcasecmp(argv[2],"ibm866"))||(!strcasecmp(argv[2],"ibm-866")))
+	sprintf(locale,"%s","IBM866");
+    else
+	sprintf(locale,"%s",argv[2]);
+    
+    if (!setlocale(LC_CTYPE,"")) {
+	printf("Can't set locale!\n");
+	exit(1);
+    }
+
+    if (strcmp(locale,nl_langinfo(CODESET))) {	/* a converter is only needed when the target differs from the locale codeset */
+	if ((icnv=iconv_open(locale,nl_langinfo(CODESET)))==(iconv_t)-1) {	/* iconv_open() signals failure with (iconv_t)-1 */
+	    printf("Can't initialize iconv!\n");
+	    exit(1);
+	}
+    }
+
+    if (stat(argv[1],&st)) {
+	printf("Specified file can't be stated!\n");
+	if (icnv!=(iconv_t)-1) iconv_close(icnv);	/* icnv stays (iconv_t)-1 when no converter was opened */
+	exit(1);
+    }
+    
+    if (!S_ISREG(st.st_mode)) {
+	printf("Specified file isn't regular file!\n");
+	if (icnv!=(iconv_t)-1) iconv_close(icnv);
+	exit(1);
+    }
+    
+    text=(unsigned char*)malloc(st.st_size);
+    if (!text) {
+	printf("Can't allocate %lu bytes of memory!\n",(unsigned long)st.st_size);
+	if (icnv!=(iconv_t)-1) iconv_close(icnv);
+	exit(1);
+    }
+    
+    f=fopen(argv[1],"r");
+    if (!f) {
+	printf("Failed to open specified file. Check permissions!\n");
+	free(text);
+	if (icnv!=(iconv_t)-1) iconv_close(icnv);
+	exit(1);
+    }
+    if (fread(text,1,st.st_size,f)!=(size_t)st.st_size) {
+	printf("Problem reading specified file!\n");
+	free(text);
+	fclose(f);
+	if (icnv!=(iconv_t)-1) iconv_close(icnv);
+	exit(1);
+    }
+    fclose(f);
+    
+    a=analyze(text,st.st_size);
+    if (a) {
+	printf("static const lng_stat2 enc_%s[]={\n",argv[2]);
+	num=print(a);
+	printf("};\n\n");
+	free(a);
+	fprintf(stderr,"static unsigned int indexes2=%d;\n",num);	/* num is an int, so %d rather than %lu */
+	fprintf(stderr,"static unsigned int npow2=%lu;\n",npow(num));
+	rc=0;
+    } else printf("Failed to allocate %lu bytes of memory!\n",(unsigned long)(array_size*sizeof(struct pstat)));
+
+    free(text);
+    if (icnv!=(iconv_t)-1) iconv_close(icnv);
+    return rc;
+}
diff --git a/statgen/test.c b/statgen/test.c
new file mode 100644
index 0000000..936b491
--- /dev/null
+++ b/statgen/test.c
@@ -0,0 +1,84 @@
+#include <stdio.h>
+#include "charset_auto_russian.h"
+
+/* Read up to mw whitespace-delimited words from f into phrase, separated by single spaces.
+ * Leading quotes/parentheses and trailing punctuation are trimmed from each word.
+ * Returns the number of words consumed; 0 means the input is exhausted.
+ * phrase must have room for mw*256 bytes (main() clamps mw accordingly). */
+static int read_phrase(FILE *f, int mw, char *phrase) {
+    char word[256];
+    int i,st,len;
+    phrase[0]=0;
+    for (i=0;i<mw;i++) {
+	/* The width keeps the token inside word[]; a failed read ends the phrase instead of reusing a stale buffer. */
+	if (fscanf(f,"%255s",word)!=1) break;
+	if (i) strcat(phrase," ");
+	for(st=0;word[st]=='"'||word[st]=='\''||word[st]=='(';st++);
+	/* len>=st stops the backwards scan before it can run past the start of the word. */
+	for(len=strlen(word)-1;len>=st&&(word[len]==','||word[len]=='.'||word[len]=='!'||word[len]=='?'||word[len]==';'||word[len]=='-'||word[len]==':'||word[len]=='"'||word[len]=='\''||word[len]==')');len--);
+	word[len+1]=0;
+	strcat(phrase,word+st);
+    }
+    return i;
+}
+
+/* Run autocharset_russian_uc() on every phrase of the file, print per-encoding totals, then list the phrases that disagree with the majority. */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    char phrase[8192];
+    unsigned long a[4]={0,0,0,0};
+    int i,max,mw;
+
+    if ((argc!=2)&&(argc!=3)) {
+	printf("Usage: %s <file name> [<max words>]\n",argv[0]);
+	exit(0);
+    }
+
+    if (argc==3) mw=atoi(argv[2]);
+    else mw=1;
+    /* mw<1 would never read from the file (endless loop); more than 32 words of 255 bytes could overflow phrase[]. */
+    if (mw<1) mw=1;
+    if (mw>(int)(sizeof(phrase)/256)) mw=sizeof(phrase)/256;
+
+    f=fopen(argv[1],"r");
+    if (!f) {
+	printf("Failed to open specified file. Check permissions!\n");
+	exit(1);
+    }
+    /* First pass: count how many phrases are attributed to each result index. */
+    while (read_phrase(f,mw,phrase)) {
+	if (strlen(phrase)<5) continue;
+	a[autocharset_russian_uc(phrase,strlen(phrase))]++;
+    }
+
+    printf("Win: %lu, Koi: %lu, Alt: %lu, UTF: %lu\n",a[0],a[1],a[3],a[2]);
+    fclose(f);
+    if (a[0]>a[1]) {
+	if (a[0]>a[2]) max=0;
+	else max=2;
+    } else {
+	if (a[1]>a[2]) max=1;
+	else max=2;
+    }
+    /* Compare a count with a count; comparing a[3] with the index held in max picked the wrong winner. */
+    if (a[3]>a[max]) max=3;
+
+    /* Second pass: print every phrase whose detected encoding differs from the majority. */
+    f=fopen(argv[1],"r");
+    if (!f) {
+	printf("Failed to open specified file. Check permissions!\n");
+	exit(1);
+    }
+    while (read_phrase(f,mw,phrase)) {
+	if (strlen(phrase)<5) continue;
+	i=autocharset_russian_uc(phrase,strlen(phrase));
+	if (i!=max) {
+	    if (i==0) printf("Win: %s\n",phrase);
+	    else if (i==1) printf("Koi: %s\n",phrase);
+	    else if (i==2) printf("UTF: %s\n",phrase);
+	    else if (i==3) printf("ALT: %s\n",phrase);
+	}
+    }
+    fclose(f);
+    return 0;
+}
diff --git a/statgen/traslations b/statgen/traslations
new file mode 100755
index 0000000..630735a
--- /dev/null
+++ b/statgen/traslations
@@ -0,0 +1,12 @@
+#! /bin/bash
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 <file name>"
+    exit
+fi
+
+# Some CP866 texts use the "Yo" and "N" symbols from the CP1251 encoding; the sed expressions below remap them.
+dos2unix -U "$1"
+sed -e "s/¸/ñ/g" "$1" | sed -e "s/¹/N/g;s/°/ø/g" | iconv -f CP866 -t KOI8-R > "$1.koi"
+sed -e "s/¸/ñ/g" "$1" | sed -e "s/¹/ü/g;s/°/ø/g" | iconv -f CP866 -t CP1251 > "$1.win"
+sed -e "s/¸/ñ/g" "$1" | sed -e "s/¹/ü/g;s/°/ø/g" | iconv -f CP866 -t UTF-8 > "$1.utf"
diff --git a/statgen/upper.c b/statgen/upper.c
new file mode 100644
index 0000000..be1a01c
--- /dev/null
+++ b/statgen/upper.c
@@ -0,0 +1,75 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+/* Upper-case every byte of <file name> with toupper() under the ru_RU.<encoding> locale and write the result to UPPED.OUT. */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    struct stat st;
+    unsigned char *text;
+    char locale[32];
+    off_t i;	/* same type as st.st_size, so the index cannot overflow before the size does */
+
+    if (argc!=3) {
+	printf("Usage: %s <file name> <encoding>\n",argv[0]);
+	exit(0);
+    }
+
+    if (strlen(argv[2])>12) {	/* also guarantees "ru_RU."+name fits into locale[32] */
+	printf("Invalid encoding(%s) specified!\n",argv[2]);
+	exit(1);
+    }
+    if ((!strcasecmp(argv[2],"koi"))||(!strcasecmp(argv[2],"koi8"))||(!strcasecmp(argv[2],"koi-8"))||(!strcasecmp(argv[2],"koi8-r")))
+    	sprintf(locale,"ru_RU.%s","KOI8-R");
+    else if ((!strcasecmp(argv[2],"win"))||(!strcasecmp(argv[2],"cp1251"))||(!strcasecmp(argv[2],"cp-1251"))||(!strcasecmp(argv[2],"win1251"))||(!strcasecmp(argv[2],"win-1251")))
+	sprintf(locale,"ru_RU.%s","CP1251");
+    else
+	sprintf(locale,"ru_RU.%s",argv[2]);
+    if (!setlocale(LC_CTYPE,locale)) {
+	printf("Can't set locale %s!\n",argv[2]);
+	exit(1);
+    }
+
+    if (stat(argv[1],&st)) {
+	printf("Specified file can't be stated!\n");
+	exit(1);
+    }
+    
+    if (!S_ISREG(st.st_mode)) {
+	printf("Specified file isn't regular file!\n");
+	exit(1);
+    }
+    
+    text=(unsigned char*)malloc(st.st_size);
+    if (!text) {
+	printf("Can't allocate %lu bytes of memory!\n",(unsigned long)st.st_size);
+	exit(1);
+    }
+    f=fopen(argv[1],"r");
+    if (!f) {
+	printf("Failed to open specified file. Check permissions!\n");
+	free(text);
+	exit(1);
+    }
+    if (fread(text,1,st.st_size,f)!=(size_t)st.st_size) {
+	printf("Problem reading specified file!\n");
+	free(text);
+	fclose(f);
+	exit(1);
+    }
+    fclose(f);
+    for (i=0;i<st.st_size;i++) 
+	text[i]=toupper(text[i]);
+    f=fopen("UPPED.OUT","w");	/* may fail (e.g. read-only directory); never hand a NULL stream to fwrite() */
+    if (!f||fwrite(text,1,st.st_size,f)!=(size_t)st.st_size||fclose(f)) {
+	printf("Failed to write UPPED.OUT!\n");
+	free(text);
+	exit(1);
+    }
+    free(text);
+    return 0;
+}
author	Suren A. Chilingaryan <csa@dside.dyndns.org>	2005-06-16 23:19:27 +0000
committer	Suren A. Chilingaryan <csa@dside.dyndns.org>	2005-06-16 23:19:27 +0000
commit	70fbe7822024d0acc68df3607ff25bf8d7a71751 (patch)
tree	553cd2ef8cfc936fc890113596db2c4478fe5163 /statgen
download	librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.gz librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.bz2 librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.xz librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.zip