source: libcaca/trunk/caca/charset.c @ 2821

Last change on this file since 2821 was 2821, checked in by Sam Hocevar, 11 years ago

Starting refactoring to get rid of libcucul. The initial reason for the
split is rendered moot by the plugin system: when enabled, binaries do
not link directly with libX11 or libGL. I hope this is a step towards
more consistency and clarity.

  • Property svn:keywords set to Id
File size: 12.3 KB
Line 
1/*
2 *  libcaca       Colour ASCII-Art library
3 *  Copyright (c) 2002-2007 Sam Hocevar <sam@zoy.org>
4 *                2007 Ben Wiley Sittler <bsittler@gmail.com>
5 *                All Rights Reserved
6 *
7 *  $Id: charset.c 2821 2008-09-27 13:12:46Z sam $
8 *
9 *  This library is free software. It comes without any warranty, to
10 *  the extent permitted by applicable law. You can redistribute it
11 *  and/or modify it under the terms of the Do What The Fuck You Want
12 *  To Public License, Version 2, as published by Sam Hocevar. See
13 *  http://sam.zoy.org/wtfpl/COPYING for more details.
14 */
15
16/*
17 *  This file contains functions for converting characters between
18 *  various character sets.
19 */
20
21#include "config.h"
22
23#if !defined(__KERNEL__)
24#   include <string.h>
25#endif
26
27#include "caca.h"
28#include "caca_internals.h"
29
30/*
31 * UTF-8 handling
32 */
33
/* Indexed by the first byte of a UTF-8 sequence: how many further bytes
 * caca_utf8_to_utf32() reads after that first byte. Bytes 0x80 - 0xbf
 * are listed with a count of zero, like plain ASCII. */
static const uint8_t trailing[256] =
{
    /* 0x00 - 0xbf: no following byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: one following byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 - 0xef: two following bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xf0 - 0xf7: three, 0xf8 - 0xfb: four, 0xfc - 0xff: five */
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
45
/* Indexed by the number of trailing bytes. caca_utf8_to_utf32() adds the
 * raw bytes of a sequence together (each shifted by a multiple of 6 bits)
 * and then subtracts the matching entry, which is the sum of the lead-byte
 * prefix and the 0x80 continuation markers at those same shifts, e.g.
 * 0x3080 == (0xc0 << 6) + 0x80. */
static const uint32_t offsets[6] =
{
    0x00000000UL,
    0x00003080UL,
    0x000E2080UL,
    0x03C82080UL,
    0xFA082080UL,
    0x82082080UL
};
51
52/*
53 * CP437 handling
54 */
55
/* UTF-32 values of the CP437 glyphs 0x01 - 0x1f. Entry [n] belongs to
 * CP437 byte n + 1; byte 0x00 has no entry. */
static const uint32_t cp437_lookup1[] =
{
    /* 0x01 - 0x07: ☺ ☻ ♥ ♦ ♣ ♠ • */
    0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
    /* 0x08 - 0x0f: ◘ ○ ◙ ♂ ♀ ♪ ♫ ☼ */
    0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c,
    /* 0x10 - 0x17: ► ◄ ↕ ‼ ¶ § ▬ ↨ */
    0x25ba, 0x25c4, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8,
    /* 0x18 - 0x1f: ↑ ↓ → ← ∟ ↔ ▲ ▼ */
    0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc
};
65
/* UTF-32 values of the CP437 glyphs 0x7f - 0xff. Entry [n] belongs to
 * CP437 byte 0x7f + n. */
static const uint32_t cp437_lookup2[] =
{
    /* 0x7f: ⌂ */
    0x2302,
    /* 0x80 - 0x87: Ç ü é â ä à å ç */
    0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
    /* 0x88 - 0x8f: ê ë è ï î ì Ä Å */
    0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
    /* 0x90 - 0x97: É æ Æ ô ö ò û ù */
    0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
    /* 0x98 - 0x9f: ÿ Ö Ü ¢ £ ¥ ₧ ƒ */
    0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
    /* 0xa0 - 0xa7: á í ó ú ñ Ñ ª º */
    0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
    /* 0xa8 - 0xaf: ¿ ⌐ ¬ ½ ¼ ¡ « » */
    0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
    /* 0xb0 - 0xb7: ░ ▒ ▓ │ ┤ ╡ ╢ ╖ */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    /* 0xb8 - 0xbf: ╕ ╣ ║ ╗ ╝ ╜ ╛ ┐ */
    0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
    /* 0xc0 - 0xc7: └ ┴ ┬ ├ ─ ┼ ╞ ╟ */
    0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
    /* 0xc8 - 0xcf: ╚ ╔ ╩ ╦ ╠ ═ ╬ ╧ */
    0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
    /* 0xd0 - 0xd7: ╨ ╤ ╥ ╙ ╘ ╒ ╓ ╫ */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
    /* 0xd8 - 0xdf: ╪ ┘ ┌ █ ▄ ▌ ▐ ▀ */
    0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
    /* 0xe0 - 0xe7: α ß Γ π Σ σ µ τ */
    0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,
    /* 0xe8 - 0xef: Φ Θ Ω δ ∞ φ ε ∩ */
    0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
    /* 0xf0 - 0xf7: ≡ ± ≥ ≤ ⌠ ⌡ ÷ ≈ */
    0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
    /* 0xf8 - 0xff: ° ∙ · √ ⁿ ² ■ <nbsp> */
    0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
};
95
96/** \brief Convert a UTF-8 character to UTF-32.
97 *
98 *  Convert a UTF-8 character read from a string and return its value in
99 *  the UTF-32 character set. If the second argument is not null, the total
100 *  number of read bytes is written in it.
101 *
102 *  If a null byte was reached before the expected end of the UTF-8 sequence,
103 *  this function returns zero and the number of read bytes is set to zero.
104 *
105 *  This function never fails, but its behaviour with illegal UTF-8 sequences
106 *  is undefined.
107 *
108 *  \param s A string containing the UTF-8 character.
109 *  \param bytes A pointer to a size_t to store the number of bytes in the
110 *         character, or NULL.
111 *  \return The corresponding UTF-32 character, or zero if the character
112 *  is incomplete.
113 */
114uint32_t caca_utf8_to_utf32(char const *s, size_t *bytes)
115{
116    int todo = trailing[(int)(unsigned char)*s];
117    int i = 0;
118    uint32_t ret = 0;
119
120    for(;;)
121    {
122        if(!*s)
123        {
124            if(bytes)
125                *bytes = 0;
126            return 0;
127        }
128
129        ret += ((uint32_t)(unsigned char)*s++) << (6 * (todo - i));
130
131        if(todo == i++)
132        {
133            if(bytes)
134                *bytes = i;
135            return ret - offsets[todo];
136        }
137    }
138}
139
/** \brief Convert a UTF-32 character to UTF-8.
 *
 *  Encode a UTF-32 character as a UTF-8 sequence and store that sequence
 *  in the given buffer. Between one and four bytes are written, and no
 *  terminating null byte is appended.
 *
 *  This function never fails, but its behaviour with illegal UTF-32 characters
 *  is undefined.
 *
 *  \param buf A pointer to a character buffer where the UTF-8 sequence will
 *  be written.
 *  \param ch The UTF-32 character.
 *  \return The number of bytes written.
 */
size_t caca_utf32_to_utf8(char *buf, uint32_t ch)
{
    /* Prefix bits of the first byte, indexed by sequence length */
    static const uint8_t lead_prefix[7] =
    {
        0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };

    size_t len, pos;

    /* ASCII is stored unchanged in a single byte */
    if(ch < 0x80)
    {
        buf[0] = ch;
        return 1;
    }

    if(ch < 0x800)
        len = 2;
    else if(ch < 0x10000)
        len = 3;
    else
        len = 4;

    /* Fill the continuation bytes from the last one back to the second,
     * consuming six bits of the character each time. */
    for(pos = len - 1; pos > 0; pos--)
    {
        buf[pos] = (ch | 0x80) & 0xbf;
        ch >>= 6;
    }

    /* Whatever bits remain go into the first byte, below its prefix */
    buf[0] = ch | lead_prefix[len];

    return len;
}
182
183/** \brief Convert a UTF-32 character to CP437.
184 *
185 *  Convert a UTF-32 character read from a string and return its value in
186 *  the CP437 character set, or "?" if the character has no equivalent.
187 *
188 *  This function never fails.
189 *
190 *  \param ch The UTF-32 character.
191 *  \return The corresponding CP437 character, or "?" if not representable.
192 */
193uint8_t caca_utf32_to_cp437(uint32_t ch)
194{
195    unsigned int i;
196
197    if(ch < 0x00000020)
198        return '?';
199
200    if(ch < 0x00000080)
201        return ch;
202
203    for(i = 0; i < sizeof(cp437_lookup1) / sizeof(*cp437_lookup1); i++)
204        if(cp437_lookup1[i] == ch)
205            return 0x01 + i;
206
207    for(i = 0; i < sizeof(cp437_lookup2) / sizeof(*cp437_lookup2); i++)
208        if(cp437_lookup2[i] == ch)
209            return 0x7f + i;
210
211    return '?';
212}
213
214/** \brief Convert a CP437 character to UTF-32.
215 *
216 *  Convert a CP437 character read from a string and return its value in
217 *  the UTF-32 character set, or zero if the character is a CP437 control
218 *  character.
219 *
220 *  This function never fails.
221 *
222 *  \param ch The CP437 character.
223 *  \return The corresponding UTF-32 character, or zero if not representable.
224 */
225uint32_t caca_cp437_to_utf32(uint8_t ch)
226{
227    if(ch > 0x7f)
228        return cp437_lookup2[ch - 0x7f];
229
230    if(ch >= 0x20)
231        return (uint32_t)ch;
232
233    if(ch > 0)
234        return cp437_lookup1[ch - 0x01];
235
236    return 0x00000000;
237}
238
/** \brief Convert a UTF-32 character to ASCII.
 *
 *  Return an ASCII character for the given UTF-32 character. ASCII values
 *  are returned unchanged, fullwidth forms are folded onto their ASCII
 *  counterpart, and a fixed set of other characters is replaced with a
 *  graphically close ASCII character.
 *
 *  This function never fails, but its behaviour with illegal UTF-32 characters
 *  is undefined.
 *
 *  \param ch The UTF-32 character.
 *  \return The corresponding ASCII character, or a graphically close
 *  equivalent if found, or "?" if not representable.
 */
char caca_utf32_to_ascii(uint32_t ch)
{
    /* Characters that have a hand-picked ASCII stand-in */
    static const struct
    {
        uint32_t code;
        char ascii;
    }
    fallback[] =
    {
        /* Spaces: nbsp, ideographic space */
        { 0x000000a0, ' ' }, { 0x00003000, ' ' },
        /* £ ° ± */
        { 0x000000a3, 'f' }, { 0x000000b0, '\'' }, { 0x000000b1, '#' },
        /* Dots: · ∙ ・ */
        { 0x000000b7, '.' }, { 0x00002219, '.' }, { 0x000030fb, '.' },
        /* π */
        { 0x000003c0, '*' },
        /* Quotes: ‘ ’ “ ” */
        { 0x00002018, '\'' }, { 0x00002019, '\'' },
        { 0x0000201c, '"' }, { 0x0000201d, '"' },
        /* Arrows: ← ↑ → ↓ */
        { 0x00002190, '<' }, { 0x00002191, '^' },
        { 0x00002192, '>' }, { 0x00002193, 'v' },
        /* Relations: ≠ ≡ ≤ ≥ */
        { 0x00002260, '!' }, { 0x00002261, '=' },
        { 0x00002264, '<' }, { 0x00002265, '>' },
        /* Horizontal lines: ⎺ ⎻ ⎼ ⎽ ─ ═ */
        { 0x000023ba, '-' }, { 0x000023bb, '-' }, { 0x000023bc, '-' },
        { 0x000023bd, '-' }, { 0x00002500, '-' }, { 0x00002550, '-' },
        /* Vertical lines: │ ║ */
        { 0x00002502, '|' }, { 0x00002551, '|' },
        /* Corners, tees and crosses opening to the right:
         * ┌ ╒ ╓ ╔ └ ╘ ╙ ╚ ├ ╞ ╟ ╠ ┬ ╤ ╥ ╦ ┴ ╧ ╨ ╩ ┼ ╪ ╫ ╬ */
        { 0x0000250c, '+' }, { 0x00002552, '+' }, { 0x00002553, '+' },
        { 0x00002554, '+' }, { 0x00002514, '+' }, { 0x00002558, '+' },
        { 0x00002559, '+' }, { 0x0000255a, '+' }, { 0x0000251c, '+' },
        { 0x0000255e, '+' }, { 0x0000255f, '+' }, { 0x00002560, '+' },
        { 0x0000252c, '+' }, { 0x00002564, '+' }, { 0x00002565, '+' },
        { 0x00002566, '+' }, { 0x00002534, '+' }, { 0x00002567, '+' },
        { 0x00002568, '+' }, { 0x00002569, '+' }, { 0x0000253c, '+' },
        { 0x0000256a, '+' }, { 0x0000256b, '+' }, { 0x0000256c, '+' },
        /* Corners and tees closing on the right: ┐ ╕ ╖ ╗ ┘ ╛ ╜ ╝ ┤ ╡ ╢ ╣ */
        { 0x00002510, '+' }, { 0x00002555, '+' }, { 0x00002556, '+' },
        { 0x00002557, '+' }, { 0x00002518, '+' }, { 0x0000255b, '+' },
        { 0x0000255c, '+' }, { 0x0000255d, '+' }, { 0x00002524, '+' },
        { 0x00002561, '+' }, { 0x00002562, '+' }, { 0x00002563, '+' },
        /* Shades and blocks: ░ ▒ ▓ █ ▌ ▐ ■ ▬ ▮ */
        { 0x00002591, '#' }, { 0x00002592, '#' }, { 0x00002593, '#' },
        { 0x00002588, '#' }, { 0x0000258c, '#' }, { 0x00002590, '#' },
        { 0x000025a0, '#' }, { 0x000025ac, '#' }, { 0x000025ae, '#' },
        /* Half blocks: ▀ ▄ */
        { 0x00002580, '"' }, { 0x00002584, ',' },
        /* Diamonds: ◆ ♦ */
        { 0x000025c6, '+' }, { 0x00002666, '+' },
        /* Round shapes: • ○ ● ☃ ☼ */
        { 0x00002022, 'o' }, { 0x000025cb, 'o' }, { 0x000025cf, 'o' },
        { 0x00002603, 'o' }, { 0x0000263c, 'o' },
        /* 〜 */
        { 0x0000301c, '~' }
    };

    unsigned int k;

    /* Standard ASCII */
    if(ch < 0x80)
        return ch;

    /* Fullwidth Forms: U+FF01 - U+FF5E mirror ASCII '!' - '~' */
    if(ch > 0x0000ff00 && ch < 0x0000ff5f)
        return ' ' + (ch - 0x0000ff00);

    for(k = 0; k < sizeof(fallback) / sizeof(fallback[0]); k++)
        if(fallback[k].code == ch)
            return fallback[k].ascii;

    return '?';
}
377
/** \brief Tell whether a UTF-32 character is fullwidth.
 *
 *  Tell whether the given UTF-32 character is meant to occupy two cells
 *  when printed (fullwidth character). The decision is made from a fixed
 *  list of code point ranges; anything outside the ranges listed as
 *  fullwidth is treated as a standard-width character.
 *
 *  This function never fails.
 *
 *  \param ch The UTF-32 character.
 *  \return 1 if the character is fullwidth, 0 otherwise.
 */
int caca_utf32_is_fullwidth(uint32_t ch)
{
    /* Exclusive upper bounds of consecutive ranges, in increasing order.
     * Ranges alternate between standard width (even index) and fullwidth
     * (odd index), starting with standard width. */
    static const uint32_t limits[] =
    {
        0x2e80,  /* standard: everything below the CJK blocks */
        0xa700,  /* fullwidth: Japanese, Korean, CJK, Yi... */
        0xac00,  /* standard: Modified Tone Letters, Syloti Nagri */
        0xd800,  /* fullwidth: Hangul Syllables */
        0xf900,  /* standard: miscellaneous */
        0xfb00,  /* fullwidth: more CJK */
        0xfe20,  /* standard: miscellaneous */
        0xfe70,  /* fullwidth: more CJK */
        0xff00,  /* standard: miscellaneous */
        0xff61,  /* fullwidth: Fullwidth forms */
        0xffe0,  /* standard: Halfwidth forms */
        0xffe8,  /* fullwidth: more fullwidth forms */
        0x20000, /* standard: miscellaneous */
        0xe0000  /* fullwidth: more CJK */
    };

    unsigned int idx;

    for(idx = 0; idx < sizeof(limits) / sizeof(limits[0]); idx++)
        if(ch < limits[idx])
            return idx & 1;

    /* Beyond the last listed range */
    return 0;
}
422
Note: See TracBrowser for help on using the repository browser.