/*
 * Dump the Unicode code points covered by a font file.
 *
 * The tool locates the font's 'cmap' table, picks the encoding record with
 * platform 3 / encoding 10 (the UCS-4 mapping), requires that subtable to be
 * format 12, and then either prints every mapped character as UTF-8 or, with
 * -r, prints only the code point ranges.  A short summary follows.
 *
 * Usage: prog [-r] fontfile
 *
 * Define CMAP_DUMP_NO_MAIN to compile only the helper functions (useful when
 * linking them into a test harness that supplies its own main).
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Encode code point U as a NUL-terminated UTF-8 byte sequence.
 *
 * Uses the original 1..6 byte UTF-8 scheme, so every value below 0x80000000
 * is encodable.  Values of 0x80000000 and above yield an empty string, as
 * does U == 0 (its single byte is the terminator itself).
 *
 * Returns a pointer to a static buffer that is overwritten by the next call;
 * copy the result if it must outlive that.
 */
char *int2u8(unsigned U)
{
    static char out[7]; /* at most 6 encoded bytes + NUL */
    int i = 0;

    /* Each bound is the first value that no longer fits in that length. */
    if (U < 0x80) {
        out[i++] = (char)U;
    } else if (U < 0x800) {
        out[i++] = (char)(U >> 6 | 0xC0);
        out[i++] = (char)((U & 0x3f) | 0x80);
    } else if (U < 0x10000) {
        out[i++] = (char)(U >> 12 | 0xE0);
        out[i++] = (char)((U >> 6 & 0x3f) | 0x80);
        out[i++] = (char)((U & 0x3f) | 0x80);
    } else if (U < 0x200000) {
        out[i++] = (char)(U >> 18 | 0xF0);
        out[i++] = (char)((U >> 12 & 0x3f) | 0x80);
        out[i++] = (char)((U >> 6 & 0x3f) | 0x80);
        out[i++] = (char)((U & 0x3f) | 0x80);
    } else if (U < 0x4000000) {
        out[i++] = (char)(U >> 24 | 0xF8);
        out[i++] = (char)((U >> 18 & 0x3f) | 0x80);
        out[i++] = (char)((U >> 12 & 0x3f) | 0x80);
        out[i++] = (char)((U >> 6 & 0x3f) | 0x80);
        out[i++] = (char)((U & 0x3f) | 0x80);
    } else if (U < 0x80000000u) {
        out[i++] = (char)(U >> 30 | 0xFC);
        out[i++] = (char)((U >> 24 & 0x3f) | 0x80);
        out[i++] = (char)((U >> 18 & 0x3f) | 0x80);
        out[i++] = (char)((U >> 12 & 0x3f) | 0x80);
        out[i++] = (char)((U >> 6 & 0x3f) | 0x80);
        out[i++] = (char)((U & 0x3f) | 0x80);
    }
    out[i] = 0;
    return out;
}

/*
 * Read a big-endian 32-bit unsigned integer from f.
 *
 * The value is assembled byte by byte, so the result does not depend on the
 * host's byte order.  Returns 0 if four bytes could not be read; callers
 * that need to tell that apart from a genuine 0 should check feof()/ferror().
 */
uint32_t read32(FILE *f)
{
    unsigned char b[4];

    if (fread(b, 1, sizeof b, f) != sizeof b)
        return 0;
    return (uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
           (uint32_t)b[2] << 8 | (uint32_t)b[3];
}

/*
 * Read a big-endian 16-bit unsigned integer from f.
 *
 * Returns 0 if two bytes could not be read; check feof()/ferror() to
 * distinguish that from a genuine 0.
 */
uint16_t read16(FILE *f)
{
    unsigned char b[2];

    if (fread(b, 1, sizeof b, f) != sizeof b)
        return 0;
    return (uint16_t)((unsigned)b[0] << 8 | b[1]);
}

/*
 * Approximate display width, in terminal columns, of code point ch:
 * 0 for control and combining characters, 2 for wide (East Asian)
 * characters, 1 otherwise.  This is a coarse range table, not a full
 * East Asian Width implementation.
 */
int u8charwid(int ch)
{
    /* Each entry covers code points below `limit` that no earlier entry
     * claimed; entries must stay sorted by limit. */
    static const struct {
        long limit;
        int width;
    } table[] = {
        { ' ',     0 }, /* control characters (and negative values) */
        { 0x300,   1 },
        { 0x370,   0 }, /* combining */
        { 0x1100,  1 },
        { 0x1200,  2 },
        { 0x2E80,  1 },
        { 0xA000,  2 },
        { 0xAC00,  1 },
        { 0xD800,  2 },
        { 0xF900,  1 },
        { 0xFB00,  2 },
        { 0xFE00,  1 },
        { 0xFE10,  0 }, /* variation selectors */
        { 0xFE20,  2 },
        { 0xFE30,  0 }, /* combining */
        { 0xFE70,  2 },
        { 0xFF00,  1 },
        { 0xFF61,  2 }, /* fullwidth ASCII */
        { 0xFFE0,  1 }, /* halfwidth kana/jamo */
        { 0xFFE7,  2 }, /* fullwidth symbols */
        { 0x20000, 1 }, /* rest of the BMP and the supplementary multilingual plane */
        { 0x30000, 2 }, /* supplementary ideographic plane */
    };
    size_t k;

    for (k = 0; k < sizeof table / sizeof table[0]; k++) {
        if (ch < table[k].limit)
            return table[k].width;
    }
    return 1;
}

#ifndef CMAP_DUMP_NO_MAIN

enum {
    TAG_CMAP = 0x636d6170,      /* "cmap" in ASCII */
    ENCODING_UCS4 = 0x0003000A, /* platform 3, encoding 10, read as one word */
    CMAP_FORMAT_UCS4 = 12,
    WRAP_COLUMNS = 64
};

/* Number of code points in [sta, end] that also lie in [lo, hi_excl). */
static uint32_t count_overlap(uint32_t sta, uint32_t end,
                              uint32_t lo, uint32_t hi_excl)
{
    uint32_t first = sta > lo ? sta : lo;
    uint32_t last = end < hi_excl - 1 ? end : hi_excl - 1;

    return first > last ? 0 : last - first + 1;
}

/* True once a read on f has hit end-of-file or an I/O error. */
static int read_failed(FILE *f)
{
    return feof(f) || ferror(f);
}

/*
 * Parse the font on f and write the character dump (or, when onlyranges is
 * non-zero, the range list) plus the summary to stdout.
 * Returns the process exit status: 0 on success, 1 on any failure.
 */
static int dump_cmap(FILE *f, int onlyranges)
{
    /* Walk the file one 32-bit word at a time until the 'cmap' tag of the
     * table directory shows up; give up at end of file. */
    while (read32(f) != TAG_CMAP) {
        if (read_failed(f)) {
            fprintf(stderr, "no cmap table found\n");
            return 1;
        }
    }
    read32(f); /* table checksum: not verified */
    uint32_t cmap_loc = read32(f);
    if (read_failed(f) || fseek(f, (long)cmap_loc, SEEK_SET) != 0) {
        fprintf(stderr, "cannot seek to cmap table\n");
        return 1;
    }

    /* cmap header: 16-bit version, 16-bit number of encoding records. */
    read16(f);
    unsigned n_records = read16(f);

    /* Each record is a platform/encoding pair followed by a 32-bit offset
     * relative to the start of the cmap table. */
    uint32_t ucs4_loc = 0;
    int found = 0;
    for (unsigned r = 0; r < n_records && !found; r++) {
        uint32_t id = read32(f);
        uint32_t offset = read32(f);
        if (read_failed(f))
            break;
        if (id == ENCODING_UCS4) {
            ucs4_loc = cmap_loc + offset;
            found = 1;
        }
    }
    if (!found) {
        fprintf(stderr, "no UCS-4 cmap subtable found\n");
        return 1;
    }

    if (fseek(f, (long)ucs4_loc, SEEK_SET) != 0) {
        fprintf(stderr, "cannot seek to UCS-4 subtable\n");
        return 1;
    }
    if (read16(f) != CMAP_FORMAT_UCS4)
        return (printf("ucs4 table not format 12"), 1);

    /* The group count sits 12 bytes into the format 12 subtable. */
    if (fseek(f, (long)ucs4_loc + 12, SEEK_SET) != 0) {
        fprintf(stderr, "cannot seek to UCS-4 subtable\n");
        return 1;
    }
    uint32_t n_ranges = read32(f);

    unsigned long long total = 0;
    unsigned long long countkanji = 0;
    unsigned long long counthangul = 0;
    unsigned column = 0;

    for (uint32_t i = 0; i < n_ranges; i++) {
        uint32_t sta = read32(f);
        uint32_t end = read32(f);
        read32(f); /* starting glyph id: not needed */
        if (read_failed(f)) {
            fprintf(stderr, "cmap subtable is truncated\n");
            return 1;
        }
        if (end < sta) {
            fprintf(stderr, "skipping malformed range %" PRIu32 "\n", i);
            continue;
        }

        /* Counted per range so the summary is right in -r mode as well. */
        total += (unsigned long long)(end - sta) + 1;
        countkanji += count_overlap(sta, end, 0x4E00, 0xA000);
        counthangul += count_overlap(sta, end, 0xAC00, 0xD7B0);

        if (onlyranges) {
            printf("range %" PRIu32 " U%" PRIx32 " U%" PRIx32 "\n",
                   i, sta, end);
            continue;
        }

        /* Test for the end inside the body: `cp <= end` could never become
         * false when end is 0xFFFFFFFF. */
        for (uint32_t cp = sta;; cp++) {
            fputs(int2u8(cp), stdout);
            column += (unsigned)u8charwid((int)cp);
            if (column >= WRAP_COLUMNS) {
                putchar('\n');
                column = 0;
            }
            if (cp == end)
                break;
        }
    }

    printf("\nthere are %llu characters supported\n", total);
    printf("there are %llu hangul precomposed characters supported\n",
           counthangul);
    printf("there are %llu non-hangul characters supported\n",
           total - counthangul);
    printf("there are %llu chinese characters supported\n", countkanji);
    return 0;
}

/*
 * Entry point: parse the optional -r flag and the font file name, then dump
 * the font's UCS-4 cmap coverage.  Exits with 0 on success, 1 on failure.
 */
int main(int argc, char **argv)
{
    int a = 1;
    int onlyranges = 0;

    if (a >= argc)
        return (printf("need a filename"), 1);
    if (!strcmp(argv[a], "-r")) {
        onlyranges = 1;
        a++;
    }
    if (a >= argc)
        return (printf("need a filename"), 1);

    /* Read-only binary mode: the font is never modified. */
    FILE *f = fopen(argv[a], "rb");
    if (!f) {
        perror(argv[a]);
        return 1;
    }

    int status = dump_cmap(f, onlyranges);
    fclose(f);
    return status;
}

#endif /* CMAP_DUMP_NO_MAIN */