Changeset 3526


Ignore:
Timestamp:
May 27, 2009 7:04:10 AM (5 years ago)
Author:
sam
Message:

Add support for multiple charsets to img2twit, and autodetect the charset
when decoding.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • libpipi/trunk/examples/img2twit.cpp

    r3525 r3526  
    2727
    2828/*
    29  * User-definable settings.
     29 * Format-dependent settings. Change this and you risk making all other
     30 * generated strings unusable.
    3031 */
    3132
    32 /* The Unicode characters at disposal - XXX: must be _ordered_ */
    33 static const uint32_t unichars[] =
    34 {
    35     /* Printable ASCII (except space) */
    36     //0x0021, 0x007f,
    37 
    38     /* Stupid symbols and Dingbats shit */
    39     //0x25a0, 0x2600, /* Geometric Shapes */
    40     //0x2600, 0x269e, 0x26a0, 0x26bd, 0x26c0, 0x26c4, /* Misc. Symbols */
    41     //0x2701, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728, 0x2729, 0x274c,
    42     //  0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x275f,
    43     //  0x2761, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf, /* Dingbats */
    44 
    45     /* Chinese-looking stuff */
    46     //0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, /* CJK Radicals Supplement */
    47     //0x2f00, 0x2fd6, /* Kangxi Radicals */
    48     //0x3400, 0x4db6, /* CJK Unified Ideographs Extension A */
    49     0x4e00, 0x9fa6, /* CJK Unified Ideographs */
    50 
    51     /* Korean - most people don't know the difference anyway */
    52     //0xac00, 0xd7a4, /* Hangul Syllables */
    53 
    54     /* More Chinese */
    55     //0xf900, 0xfa2e, 0xfa30, 0xfa6b, 0xfa70, 0xfada, /* CJK Compat. Idgphs. */
    56 
    57     /* TODO: there's also the U+20000 and U+2f800 planes, but they're
    58      * not supported by the Twitter Javascript filter (yet?). */
    59 
    60     /* End of list marker - XXX: don't remove! */
    61     0x0000, 0x0000
    62 };
     33/* Printable ASCII (except space) */
     34#define RANGE_ASCII 0x0021, 0x007f
     35
     36/* CJK Unified Ideographs */
     37#define RANGE_CJK 0x4e00, 0x9fa6
     38//0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, /* CJK Radicals Supplement */
     39//0x2f00, 0x2fd6, /* Kangxi Radicals */
     40//0x3400, 0x4db6, /* CJK Unified Ideographs Extension A */
     41//0xac00, 0xd7a4, /* Hangul Syllables -- Korean, not Chinese */
     42//0xf900, 0xfa2e, 0xfa30, 0xfa6b, 0xfa70, 0xfada, /* CJK Compat. Idgphs. */
     43/* TODO: there's also the U+20000 and U+2f800 planes, but they're
     44 * not supported by the Twitter Javascript filter (yet?). */
     45
     46/* Stupid symbols and Dingbats shit */
     47#define RANGE_SYMBOLS 0x25a0, 0x2600, /* Geometric Shapes */ \
     48  0x2600, 0x269e, 0x26a0, 0x26bd, 0x26c0, 0x26c4, /* Misc. Symbols */ \
     49  0x2701, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728, 0x2729, 0x274c, \
     50    0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x275f, \
     51    0x2761, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf /* Dingbats */
     52
     53/* End of list marker */
     54#define RANGE_END 0x0, 0x0
     55
     56/* Pre-defined character ranges XXX: must be _ordered_ */
     57static const uint32_t unichars_ascii[] = { RANGE_ASCII, RANGE_END };
     58static const uint32_t unichars_cjk[] = { RANGE_CJK, RANGE_END };
     59static const uint32_t unichars_symbols[] = { RANGE_SYMBOLS, RANGE_END };
     60
     61/* The Unicode characters at disposal */
     62static const uint32_t *unichars;
    6363
    6464/* The maximum image size we want to support */
     
    795795int main(int argc, char *argv[])
    796796{
     797    uint32_t unicode_data[4096]; /* FIXME: allocate this dynamically */
    797798    int opstats[2 * NB_OPS];
    798799    char const *srcname = NULL, *dstname = NULL;
    799800    pipi_image_t *src, *tmp, *dst;
    800801    double error = 1.0;
    801     int width, height, ret = 0;
     802    int width, height;
    802803
    803804    /* Parse command-line options */
     
    809810            { "output",      1, NULL, 'o' },
    810811            { "length",      1, NULL, 'l' },
     812            { "charset",     1, NULL, 'c' },
    811813            { "quality",     1, NULL, 'q' },
    812814            { "debug",       0, NULL, 'd' },
     
    814816            { NULL,          0, NULL, 0   },
    815817        };
    816         int c = mygetopt(argc, argv, "o:l:q:dh", long_options, &option_index);
     818        int c = mygetopt(argc, argv, "o:l:c:q:dh", long_options, &option_index);
    817819
    818820        if(c == -1)
     
    830832                fprintf(stderr, "Warning: rounding minimum message length to 16\n");
    831833                MAX_MSG_LEN = 16;
     834            }
     835            break;
     836        case 'c':
     837            if(!strcmp(myoptarg, "ascii"))
     838                unichars = unichars_ascii;
     839            else if(!strcmp(myoptarg, "cjk"))
     840                unichars = unichars_cjk;
     841            else if(!strcmp(myoptarg, "symbols"))
     842                unichars = unichars_symbols;
     843            else
     844            {
     845                fprintf(stderr, "Error: invalid char block \"%s\".", myoptarg);
     846                fprintf(stderr, "Valid sets are: ascii, cjk, symbols\n");
     847                return EXIT_FAILURE;
    832848            }
    833849            break;
     
    850866            printf("  -o, --output <filename>   output resulting image to filename\n");
    851867            printf("  -l, --length <size>       message length in characters (default 140)\n");
     868            printf("  -c, --charset <block>     character set to use (ascii, [cjk], symbols)\n");
    852869            printf("  -q, --quality <rate>      set image quality (0 - 10) (default 5)\n");
    853870            printf("  -d, --debug               print debug information\n");
     
    879896    if(myoptind == argc - 1)
    880897        srcname = argv[myoptind];
     898
     899    /* Decoding mode: read UTF-8 text from stdin */
     900    if(dstname)
     901        for(int i = 0; i < MAX_MSG_LEN; i++)
     902            unicode_data[i] = fread_utf8(stdin);
     903
     904    /* Autodetect charset if decoding, otherwise switch to CJK. */
     905    if(!unichars)
     906    {
     907        if(dstname)
     908        {
     909            if(unicode_data[0] >= 0x0021 && unicode_data[0] < 0x007f)
     910                unichars = unichars_ascii;
     911            else if(unicode_data[0] >= 0x4e00 && unicode_data[0] < 0x9fa6)
     912                unichars = unichars_cjk;
     913            else if(unicode_data[0] >= 0x25a0 && unicode_data[0] < 0x27bf)
     914                unichars = unichars_symbols;
     915            else
     916            {
     917                fprintf(stderr, "Error: unable to detect charset\n");
     918                return EXIT_FAILURE;
     919            }
     920        }
     921        else
     922            unichars = unichars_cjk;
     923    }
    881924
    882925    pipi_set_gamma(1.0);
     
    904947    if(dstname)
    905948    {
    906         /* Decoding mode: read UTF-8 text from stdin, find each
    907          * character's index in our character list, and push it to our
    908          * wonderful custom bitstream. */
    909         uint32_t data[MAX_MSG_LEN];
    910         for(int i = 0; i < MAX_MSG_LEN; i++)
    911             data[i] = uni2index(fread_utf8(stdin));
     949        /* Decoding mode: find each character's index in our character
     950         * list, and push it to our wonderful custom bitstream. */
    912951        for(int i = MAX_MSG_LEN; i--; )
    913             b.push(data[i], NUM_CHARACTERS);
     952            b.push(uni2index(unicode_data[i]), NUM_CHARACTERS);
    914953
    915954        /* Read width and height from bitstream */
     
    933972    }
    934973
    935     /* Compute best w/h ratio */
     974    /* Compute "best" w/h ratio */
    936975    dw = 1; dh = TOTAL_CELLS;
    937976    for(unsigned int i = 1; i <= TOTAL_CELLS; i++)
     
    11611200    }
    11621201
    1163     return ret;
    1164 }
    1165 
     1202    return EXIT_SUCCESS;
     1203}
     1204
Note: See TracChangeset for help on using the changeset viewer.