Grcov report - cairo-unicode.c

1

/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */

2

/* cairo - a vector graphics library with display and print output

3

4

 * The code in this file is derived from GLib's gutf8.c and

5

 *   ultimately from libunicode. It is relicensed under the

6

 *   dual LGPL/MPL with permission of the original authors.

7

8

 * Copyright © 1999 Tom Tromey

9

 * Copyright © 2005 Red Hat, Inc

10

11

 * This library is free software; you can redistribute it and/or

12

 * modify it either under the terms of the GNU Lesser General Public

13

 * License version 2.1 as published by the Free Software Foundation

14

 * (the "LGPL") or, at your option, under the terms of the Mozilla

15

 * Public License Version 1.1 (the "MPL"). If you do not alter this

16

 * notice, a recipient may use your version of this file under either

17

 * the MPL or the LGPL.

18

19

 * You should have received a copy of the LGPL along with this library

20

 * in the file COPYING-LGPL-2.1; if not, write to the Free Software

21

 * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA

22

 * You should have received a copy of the MPL along with this library

23

 * in the file COPYING-MPL-1.1

24

25

 * The contents of this file are subject to the Mozilla Public License

26

 * Version 1.1 (the "License"); you may not use this file except in

27

 * compliance with the License. You may obtain a copy of the License at

28

 * http://www.mozilla.org/MPL/

29

30

 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY

31

 * OF ANY KIND, either express or implied. See the LGPL or the MPL for

32

 * the specific language governing rights and limitations.

33

34

 * The Original Code is the cairo graphics library.

35

36

 * The Initial Developer of the Original Code is Tom Tromey.

37

 *  and Red Hat, Inc.

38

39

 * Contributor(s):

40

 *	Owen Taylor <otaylor@redhat.com>

41

*/

42

43

#include "cairoint.h"

44

#include "cairo-error-private.h"

45

46

/*
 * UTF8_COMPUTE:
 * Classifies @Char, the lead byte of a UTF-8 sequence. On completion @Len
 * holds the length of the sequence in bytes (1 to 6) and @Mask selects the
 * payload bits carried by the lead byte. If @Char matches none of the lead
 * byte patterns (0x80..0xbf, 0xfe, 0xff), @Len is set to -1 and @Mask is
 * left untouched.
 *
 * @Char is evaluated several times, so it must be free of side effects.
 * @Mask and @Len must be assignable lvalues. The expansion is a single
 * statement, so it is safe as the body of an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  do {                                                                        \
    if ((Char) < 128) {                                                       \
      (Len) = 1;                                                              \
      (Mask) = 0x7f;                                                          \
    } else if (((Char) & 0xe0) == 0xc0) {                                     \
      (Len) = 2;                                                              \
      (Mask) = 0x1f;                                                          \
    } else if (((Char) & 0xf0) == 0xe0) {                                     \
      (Len) = 3;                                                              \
      (Mask) = 0x0f;                                                          \
    } else if (((Char) & 0xf8) == 0xf0) {                                     \
      (Len) = 4;                                                              \
      (Mask) = 0x07;                                                          \
    } else if (((Char) & 0xfc) == 0xf8) {                                     \
      (Len) = 5;                                                              \
      (Mask) = 0x03;                                                          \
    } else if (((Char) & 0xfe) == 0xfc) {                                     \
      (Len) = 6;                                                              \
      (Mask) = 0x01;                                                          \
    } else {                                                                  \
      (Len) = -1;                                                             \
    }                                                                         \
  } while (0)

79

80

/*
 * UTF8_LENGTH:
 * Evaluates to the number of bytes (1 to 6) of the shortest UTF-8 style
 * encoding that can hold the value @Char. @Char is evaluated more than
 * once, so it must be free of side effects.
 */
#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))

86

87

/*
 * UTF8_GET:
 * Decodes the @Len byte sequence starting at @Chars into @Result. @Mask
 * selects the payload bits of the lead byte; each following byte must have
 * the form 10xxxxxx and contributes its low six bits. @Count is used as the
 * loop index.
 *
 * If a following byte does not have the 10xxxxxx form, @Result is set to -1
 * and decoding stops with @Count left at the index of the offending byte.
 *
 * The arguments are evaluated several times. The expansion is a single
 * statement, so it is safe as the body of an unbraced if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do {                                                                        \
    (Result) = (Chars)[0] & (Mask);                                           \
    for ((Count) = 1; (Count) < (Len); ++(Count)) {                           \
      if (((Chars)[(Count)] & 0xc0) != 0x80) {                                \
        (Result) = -1;                                                        \
        break;                                                                \
      }                                                                       \
      (Result) <<= 6;                                                         \
      (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
    }                                                                         \
  } while (0)

98

99

100

/*
 * UNICODE_VALID:
 * Non-zero when @Char is below 0x110000 and is not in the surrogate range
 * 0xD800..0xDFFF. @Char is evaluated twice.
 */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800))

103

104

/*
 * Number of bytes to advance for each possible lead byte value. Bytes
 * 0x00..0xbf map to 1, 0xc0..0xdf to 2, 0xe0..0xef to 3, 0xf0..0xf7 to 4,
 * 0xf8..0xfb to 5, 0xfc..0xfd to 6, and 0xfe/0xff to 1.
 */
static const char utf8_skip_data[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

114

115

/* Evaluates to @p advanced by the utf8_skip_data entry for the byte that
 * @p points at. @p is evaluated twice. */
#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(const unsigned char *)(p)])

116

117

/* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 *
 * The lead byte at @p determines how many bytes are read; no length limit
 * is applied, so @p must point into a string that has already been
 * validated. Returns (uint32_t)-1 if the lead byte cannot start a sequence
 * or if a following byte does not have the 10xxxxxx form. Overlong
 * encodings and out-of-range values are not detected here.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    int i, mask = 0, len;
    uint32_t result;
    unsigned char c = (unsigned char) *p;

    UTF8_COMPUTE (c, mask, len);
    if (len == -1)
	return (uint32_t)-1;
    UTF8_GET (result, p, i, mask, len);

    return result;
}

134

135

136

/* Like _utf8_get_char, but take a maximum length and validate the
 * sequence while decoding it.
 *
 * @p: first byte of the sequence to decode
 * @max_len: number of bytes available at @p, or a negative value for no
 *   limit
 *
 * Returns the decoded value, or one of two sentinels:
 *   (uint32_t)-1  the sequence is malformed: invalid lead byte, a
 *                 following byte that is not of the form 10xxxxxx, or an
 *                 encoding longer than the value requires
 *   (uint32_t)-2  the sequence is incomplete: it is cut short by @max_len
 *                 or by a nul byte
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
			 long		      max_len)
{
    int i, len;
    uint32_t wc = (unsigned char) *p;

    if (wc < 0x80) {
	return wc;
    } else if (wc < 0xc0) {
	/* A continuation byte cannot start a sequence. */
	return (uint32_t)-1;
    } else if (wc < 0xe0) {
	len = 2;
	wc &= 0x1f;
    } else if (wc < 0xf0) {
	len = 3;
	wc &= 0x0f;
    } else if (wc < 0xf8) {
	len = 4;
	wc &= 0x07;
    } else if (wc < 0xfc) {
	len = 5;
	wc &= 0x03;
    } else if (wc < 0xfe) {
	len = 6;
	wc &= 0x01;
    } else {
	return (uint32_t)-1;
    }

    if (max_len >= 0 && len > max_len) {
	/* The sequence runs past the available bytes. Report it as
	 * malformed if the bytes that are present are already wrong,
	 * otherwise as incomplete. */
	for (i = 1; i < max_len; i++) {
	    if ((p[i] & 0xc0) != 0x80)
		return (uint32_t)-1;
	}
	return (uint32_t)-2;
    }

    for (i = 1; i < len; ++i) {
	uint32_t ch = p[i];

	if ((ch & 0xc0) != 0x80) {
	    /* A nul byte means the string ended mid-sequence. */
	    if (ch)
		return (uint32_t)-1;
	    else
		return (uint32_t)-2;
	}

	wc <<= 6;
	wc |= (ch & 0x3f);
    }

    /* Reject overlong encodings. */
    if (UTF8_LENGTH(wc) != len)
	return (uint32_t)-1;

    return wc;
}

195

196

197

/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of a valid UTF-8 string, and returns
 * the number of bytes consumed.
 *
 * Note that the string should be valid.  Do not use this without
 * validating the string first. If the first byte cannot start a UTF-8
 * sequence, (uint32_t)-1 is stored in @unicode and 1 is returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
				uint32_t   *unicode)
{
    int i, mask = 0, len;
    uint32_t result;
    unsigned char c = (unsigned char) *p;

    UTF8_COMPUTE (c, mask, len);
    if (len == -1) {
	if (unicode)
	    *unicode = (uint32_t)-1;
	return 1;
    }
    UTF8_GET (result, p, i, mask, len);

    if (unicode)
	*unicode = result;
    return len;
}

229

230

231

/**

232

 * _cairo_utf8_to_ucs4:

233

 * @str: an UTF-8 string

234

 * @len: length of @str in bytes, or -1 if it is nul-terminated.

235

 *   If @len is supplied and the string has an embedded nul

236

 *   byte, only the portion before the nul byte is converted.

237

 * @result: location to store a pointer to a newly allocated UTF-32

238

 *   string (always native endian), or %NULL. Free with free(). A 0

239

 *   word will be written after the last character.

240

 * @items_written: location to store number of 32-bit words

241

 *   written. (Not including the trailing 0)

242

243

 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode

244

 * with 1 32-bit word per character. The string is validated to

245

 * consist entirely of valid Unicode characters.

246

247

 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was

248

 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an

249

 *   invalid sequence was found.

250

**/

251

cairo_status_t

252

71216

_cairo_utf8_to_ucs4 (const char *str,

253

		     int	 len,

254

		     uint32_t  **result,

255

		     int	*items_written)

256

257

71216

    uint32_t *str32 = NULL;

258

    int n_chars, i;

259

    const unsigned char *in;

260

71216

    const unsigned char * const ustr = (const unsigned char *) str;

261

262

71216

    in = ustr;

263

71216

    n_chars = 0;

264

463703

    while ((len < 0 || ustr + len - in > 0) && *in)

265

266

392488

	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);

267

392488

	if (wc & 0x80000000 || !UNICODE_VALID (wc))

268

1

	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

269

270

392487

	n_chars++;

271

392487

	if (n_chars == INT_MAX)

272

	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

273

274

392487

	in = UTF8_NEXT_CHAR (in);

275

276

277

71215

    if (result) {

278

	str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));

279

	if (!str32)

280

	    return _cairo_error (CAIRO_STATUS_NO_MEMORY);

281

282

	in = ustr;

283

	for (i=0; i < n_chars; i++) {

284

	    str32[i] = _utf8_get_char (in);

285

	    in = UTF8_NEXT_CHAR (in);

286

287

	str32[i] = 0;

288

289

	*result = str32;

290

291

292

71215

    if (items_written)

293

67995

	*items_written = n_chars;

294

295

71215

    return CAIRO_STATUS_SUCCESS;

296

297

298

/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 * space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8. No terminating nul byte is written. When
 * @utf8 is %NULL only the encoded length is computed. Values of
 * 0x110000 and above lie outside the Unicode range and are rejected.
 * Surrogate values (0xD800..0xDFFF) are not checked and are encoded
 * as three bytes.
 *
 * Return value: Number of bytes in the utf8 string or 0 if an invalid
 * unicode character
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
		     char     *utf8)
{
    int bytes;
    char *p;

    if (unicode < 0x80) {
	if (utf8)
	    *utf8 = (char) unicode;
	return 1;
    } else if (unicode < 0x800) {
	bytes = 2;
    } else if (unicode < 0x10000) {
	bytes = 3;
    } else if (unicode < 0x110000) {
	bytes = 4;
    } else {
	return 0;
    }

    if (!utf8)
	return bytes;

    /* Fill from the last byte backwards, six payload bits at a time.
     * Every byte, including the first, starts out as 10xxxxxx. */
    p = utf8 + bytes;
    while (p > utf8) {
	*--p = (char) (0x80 | (unicode & 0x3f));
	unicode >>= 6;
    }

    /* Turn the first byte into the lead byte: 0xc0, 0xe0 or 0xf0 for a
     * 2, 3 or 4 byte sequence respectively. */
    *p = (char) ((unsigned char) *p | ((0xf0 << (4 - bytes)) & 0xff));

    return bytes;
}

341

342

343

/**
 * _cairo_ucs4_to_utf16:
 * @unicode: a UCS-4 character
 * @utf16: buffer to write utf16 string into. Must have at least 2
 * elements. Or %NULL.
 *
 * Encodes @unicode as UTF-16 in native byte order. Values below
 * 0x10000 are stored as a single element without further checks;
 * values from 0x10000 up to 0x10FFFF are stored as a high/low
 * surrogate pair. No terminating 0 element is written. When @utf16
 * is %NULL only the encoded length is computed.
 *
 * Return value: Number of elements in the utf16 string or 0 if an
 * invalid unicode character
 **/
int
_cairo_ucs4_to_utf16 (uint32_t  unicode,
		      uint16_t *utf16)
{
    if (unicode < 0x10000) {
	if (utf16)
	    utf16[0] = (uint16_t) unicode;
	return 1;
    } else if (unicode < 0x110000) {
	if (utf16) {
	    /* Split the 20-bit offset above 0x10000 into two 10-bit
	     * halves. */
	    utf16[0] = (uint16_t) ((unicode - 0x10000) / 0x400 + 0xd800);
	    utf16[1] = (uint16_t) ((unicode - 0x10000) % 0x400 + 0xdc00);
	}
	return 2;
    } else {
	return 0;
    }
}

370

371

372

373

#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian). Free with free(). A 0
 *   word will be written after the last character.
 * @items_written: location to store number of 16-bit words
 *   written. (Not including the trailing 0)
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if
 *   an invalid sequence was found. %CAIRO_STATUS_NO_MEMORY if the
 *   output buffer could not be allocated.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
		      int	  len,
		      uint16_t **result,
		      int	*items_written)
{
    uint16_t *str16 = NULL;
    int n16, i;
    const unsigned char *in;
    const unsigned char * const ustr = (const unsigned char *) str;

    /* First pass: validate the input and count the 16-bit words. */
    in = ustr;
    n16 = 0;
    while ((len < 0 || in - ustr < len) && *in) {
	/* Bytes still available, or -1 for "no limit". Computed from
	 * offsets so that no pointer before @str is ever formed when
	 * @len is negative. */
	long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
	uint32_t wc = _utf8_get_char_extended (in, remaining);

	/* The high bit catches the -1 / -2 error sentinels. */
	if (wc & 0x80000000 || !UNICODE_VALID (wc))
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

	if (wc < 0x10000)
	    n16 += 1;
	else
	    n16 += 2;

	/* Stop before the next increment, or the n16 + 1 below, could
	 * overflow. */
	if (n16 == INT_MAX - 1 || n16 == INT_MAX)
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

	in = UTF8_NEXT_CHAR (in);
    }

    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
    if (!str16)
	return _cairo_error (CAIRO_STATUS_NO_MEMORY);

    /* Second pass: the input is known to be valid, so decode it with the
     * unchecked reader and re-encode each character. */
    in = ustr;
    for (i = 0; i < n16;) {
	uint32_t wc = _utf8_get_char (in);

	i += _cairo_ucs4_to_utf16 (wc, str16 + i);

	in = UTF8_NEXT_CHAR (in);
    }

    str16[i] = 0;

    *result = str16;
    if (items_written)
	*items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif