Quelle hb-utf.hh

Sprache: C

/*
* Copyright © 2011,2012,2014  Google, Inc.
*
*  This is part of HarfBuzz, a text shaping library.
*
* Permission is hereby granted, without written agreement and without
* license or royalty fees, to use, copy, modify, and distribute this
* software and its documentation for any purpose, provided that the
* above copyright notice and the following two paragraphs appear in
* all copies of this software.
*
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*
* Google Author(s): Behdad Esfahbod
*/

#ifndef HB_UTF_HH
#define HB_UTF_HH

#include "hb.hh"

#include "hb-open-type.hh"

struct hb_utf8_t
{
  typedef uint8_t codepoint_t;
  static constexpr unsigned max_len = 4;

  static const codepoint_t *
  next (const codepoint_t *text,
const codepoint_t *end,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    /* Written to only accept well-formed sequences.
     * Based on ideas from ICU's U8_NEXT.
     * Generates one "replacement" for each ill-formed byte. */

    hb_codepoint_t c = *text++;

    if (c > 0x7Fu)
    {
      if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */
      {
unsigned int t1;
if (likely (text < end &&
      (t1 = text[0] - 0x80u) <= 0x3Fu))
{
   c = ((c&0x1Fu)<<6) | t1;
   text++;
}
else
   goto error;
      }
      else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */
      {
unsigned int t1, t2;
if (likely (1 < end - text &&
      (t1 = text[0] - 0x80u) <= 0x3Fu &&
      (t2 = text[1] - 0x80u) <= 0x3Fu))
{
   c = ((c&0xFu)<<12) | (t1<<6) | t2;
   if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
     goto error;
   text += 2;
}
else
   goto error;
      }
      else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */
      {
unsigned int t1, t2, t3;
if (likely (2 < end - text &&
      (t1 = text[0] - 0x80u) <= 0x3Fu &&
      (t2 = text[1] - 0x80u) <= 0x3Fu &&
      (t3 = text[2] - 0x80u) <= 0x3Fu))
{
   c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
   if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu)))
     goto error;
   text += 3;
}
else
   goto error;
      }
      else
goto error;
    }

    *unicode = c;
    return text;

  error:
    *unicode = replacement;
    return text;
  }

  static const codepoint_t *
  prev (const codepoint_t *text,
const codepoint_t *start,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    const codepoint_t *end = text--;
    while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
      text--;

    if (likely (next (text, end, unicode, replacement) == end))
      return text;

    *unicode = replacement;
    return end - 1;
  }

  static unsigned int
  strlen (const codepoint_t *text)
  { return ::strlen ((const char *) text); }

  static unsigned int
  encode_len (hb_codepoint_t unicode)
  {
    if (unicode <   0x0080u) return 1;
    if (unicode <   0x0800u) return 2;
    if (unicode <  0x10000u) return 3;
    if (unicode < 0x110000u) return 4;
    return 3;
  }

  static codepoint_t *
  encode (codepoint_t *text,
   const codepoint_t *end,
   hb_codepoint_t unicode)
  {
    if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
      unicode = 0xFFFDu;
    if (unicode < 0x0080u)
     *text++ = unicode;
    else if (unicode < 0x0800u)
    {
      if (end - text >= 2)
      {
*text++ =  0xC0u + (0x1Fu & (unicode >>  6));
*text++ =  0x80u + (0x3Fu & (unicode      ));
      }
    }
    else if (unicode < 0x10000u)
    {
      if (end - text >= 3)
      {
*text++ =  0xE0u + (0x0Fu & (unicode >> 12));
*text++ =  0x80u + (0x3Fu & (unicode >>  6));
*text++ =  0x80u + (0x3Fu & (unicode      ));
      }
    }
    else
    {
      if (end - text >= 4)
      {
*text++ =  0xF0u + (0x07u & (unicode >> 18));
*text++ =  0x80u + (0x3Fu & (unicode >> 12));
*text++ =  0x80u + (0x3Fu & (unicode >>  6));
*text++ =  0x80u + (0x3Fu & (unicode      ));
      }
    }
    return text;
  }
};

template <typename TCodepoint>
struct hb_utf16_xe_t
{
  static_assert (sizeof (TCodepoint) == 2, "");
  typedef TCodepoint codepoint_t;
  static constexpr unsigned max_len = 2;

  static const codepoint_t *
  next (const codepoint_t *text,
const codepoint_t *end,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    hb_codepoint_t c = *text++;

    if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
    {
      *unicode = c;
      return text;
    }

    if (likely (c <= 0xDBFFu && text < end))
    {
      /* High-surrogate in c */
      hb_codepoint_t l = *text;
      if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu)))
      {
/* Low-surrogate in l */
*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
  text++;
  return text;
      }
    }

    /* Lonely / out-of-order surrogate. */
    *unicode = replacement;
    return text;
  }

  static const codepoint_t *
  prev (const codepoint_t *text,
const codepoint_t *start,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    hb_codepoint_t c = *--text;

    if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
    {
      *unicode = c;
      return text;
    }

    if (likely (c >= 0xDC00u && start < text))
    {
      /* Low-surrogate in c */
      hb_codepoint_t h = text[-1];
      if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu)))
      {
/* High-surrogate in h */
*unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
text--;
return text;
      }
    }

    /* Lonely / out-of-order surrogate. */
    *unicode = replacement;
    return text;
  }

  static unsigned int
  strlen (const codepoint_t *text)
  {
    unsigned int l = 0;
    while (*text++) l++;
    return l;
  }

  static unsigned int
  encode_len (hb_codepoint_t unicode)
  {
    return unicode < 0x10000 ? 1 : 2;
  }

  static codepoint_t *
  encode (codepoint_t *text,
   const codepoint_t *end,
   hb_codepoint_t unicode)
  {
    if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
      unicode = 0xFFFDu;
    if (unicode < 0x10000u)
     *text++ = unicode;
    else if (end - text >= 2)
    {
      unicode -= 0x10000u;
      *text++ =  0xD800u + (unicode >> 10);
      *text++ =  0xDC00u + (unicode & 0x03FFu);
    }
    return text;
  }
};

typedef hb_utf16_xe_t<uint16_t> hb_utf16_t;
typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t;

template <typename TCodepoint, bool validate=true>
struct hb_utf32_xe_t
{
  static_assert (sizeof (TCodepoint) == 4, "");
  typedef TCodepoint codepoint_t;
  static constexpr unsigned max_len = 1;

  static const TCodepoint *
  next (const TCodepoint *text,
const TCodepoint *end HB_UNUSED,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    hb_codepoint_t c = *unicode = *text++;
    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
      *unicode = replacement;
    return text;
  }

  static const TCodepoint *
  prev (const TCodepoint *text,
const TCodepoint *start HB_UNUSED,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    hb_codepoint_t c = *unicode = *--text;
    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
      *unicode = replacement;
    return text;
  }

  static unsigned int
  strlen (const TCodepoint *text)
  {
    unsigned int l = 0;
    while (*text++) l++;
    return l;
  }

  static unsigned int
  encode_len (hb_codepoint_t unicode HB_UNUSED)
  {
    return 1;
  }

  static codepoint_t *
  encode (codepoint_t *text,
   const codepoint_t *end HB_UNUSED,
   hb_codepoint_t unicode)
  {
    if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
      unicode = 0xFFFDu;
    *text++ = unicode;
    return text;
  }
};

typedef hb_utf32_xe_t<uint32_t> hb_utf32_t;
typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t;

struct hb_latin1_t
{
  typedef uint8_t codepoint_t;
  static constexpr unsigned max_len = 1;

  static const codepoint_t *
  next (const codepoint_t *text,
const codepoint_t *end HB_UNUSED,
hb_codepoint_t *unicode,
hb_codepoint_t replacement HB_UNUSED)
  {
    *unicode = *text++;
    return text;
  }

  static const codepoint_t *
  prev (const codepoint_t *text,
const codepoint_t *start HB_UNUSED,
hb_codepoint_t *unicode,
hb_codepoint_t replacement HB_UNUSED)
  {
    *unicode = *--text;
    return text;
  }

  static unsigned int
  strlen (const codepoint_t *text)
  {
    unsigned int l = 0;
    while (*text++) l++;
    return l;
  }

  static unsigned int
  encode_len (hb_codepoint_t unicode HB_UNUSED)
  {
    return 1;
  }

  static codepoint_t *
  encode (codepoint_t *text,
   const codepoint_t *end HB_UNUSED,
   hb_codepoint_t unicode)
  {
    if (unlikely (unicode >= 0x0100u))
      unicode = '?';
    *text++ = unicode;
    return text;
  }
};

struct hb_ascii_t
{
  typedef uint8_t codepoint_t;
  static constexpr unsigned max_len = 1;

  static const codepoint_t *
  next (const codepoint_t *text,
const codepoint_t *end HB_UNUSED,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    *unicode = *text++;
    if (*unicode >= 0x0080u)
      *unicode = replacement;
    return text;
  }

  static const codepoint_t *
  prev (const codepoint_t *text,
const codepoint_t *start HB_UNUSED,
hb_codepoint_t *unicode,
hb_codepoint_t replacement)
  {
    *unicode = *--text;
    if (*unicode >= 0x0080u)
      *unicode = replacement;
    return text;
  }

  static unsigned int
  strlen (const codepoint_t *text)
  {
    unsigned int l = 0;
    while (*text++) l++;
    return l;
  }

  static unsigned int
  encode_len (hb_codepoint_t unicode HB_UNUSED)
  {
    return 1;
  }

  static codepoint_t *
  encode (codepoint_t *text,
   const codepoint_t *end HB_UNUSED,
   hb_codepoint_t unicode)
  {
    if (unlikely (unicode >= 0x0080u))
      unicode = '?';
    *text++ = unicode;
    return text;
  }
};

template <typename utf_t>
static inline const typename utf_t::codepoint_t *
hb_utf_offset_to_pointer (const typename utf_t::codepoint_t *start,
     const typename utf_t::codepoint_t *text,
     unsigned text_len,
     signed offset)
{
  hb_codepoint_t unicode;

  while (offset-- > 0)
    start = utf_t::next (start,
    text + text_len,
    &unicode,
    HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT);

  while (offset++ < 0)
    start = utf_t::prev (start,
    text,
    &unicode,
    HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT);

  return start;
}

#endif /* HB_UTF_HH */

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.11 Sekunden (vorverarbeitet am 2026-06-10) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.