/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <sal/config.h>
#include <cassert>
#include "converter.hxx"
#include "convertisciidevangari.hxx"
#include "convertsinglebytetobmpunicode.hxx"
#include <rtl/character.hxx>
#include <rtl/textcvt.h>
using namespace sal::detail::textenc;
using namespace rtl::textenc;
namespace {
// Conversion state for the ISCII Devanagari -> Unicode direction.
struct IsciiDevanagariToUnicode
{
    // Source byte remembered by the previous successful step of convert():
    // the byte itself when it was mapped through the plain lookup table,
    // 0 when the emitted character came from a special sequence (and also
    // right after construction or reset()).
    sal_uInt8 m_cPrevChar = 0;

    // Drop the remembered byte so the next convert() call starts clean.
    void reset() { m_cPrevChar = 0; }

    sal_Size convert(
        char const* pSrcBuf, sal_Size nSrcBytes,
        sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
        sal_uInt32* pInfo, sal_Size* pSrcCvtBytes);
};
// Conversion state for the Unicode -> ISCII Devanagari direction.
struct UnicodeToIsciiDevanagari
{
    // Code point remembered by convert(): the last character that was
    // encoded through the range table, or 0 after a special sequence was
    // written (and right after construction or reset()).
    sal_Unicode m_cPrevChar = 0;

    // High surrogate still pending when the previous convert() call
    // returned; 0 when no surrogate is pending.
    sal_Unicode m_cHighSurrogate = 0;

    // Forget both the remembered character and any pending surrogate.
    void reset()
    {
        m_cHighSurrogate = 0;
        m_cPrevChar = 0;
    }

    sal_Size convert(
        sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
        char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
        sal_uInt32* pInfo, sal_Size* pSrcCvtChars);
};
}
// Lookup table from a single ISCII Devanagari byte (the array index) to the
// UTF-16 code unit it is converted to. Bytes 0x00-0x7F map to themselves;
// 0xFFFF marks a byte that has no single-byte mapping (the converter treats
// that value as "undefined").
const sal_Unicode IsciiDevanagariMap[256] =
{
    /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    /* 0x08 */ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
    /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    /* 0x18 */ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
    /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    /* 0x28 */ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
    /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* 0x38 */ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
    /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* 0x48 */ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
    /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    /* 0x58 */ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
    /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    /* 0x68 */ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
    /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    /* 0x78 */ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
    /* 0x80 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    /* 0x88 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    /* 0x90 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    /* 0x98 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    /* 0xA0 */ 0xFFFF, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908,
    /* 0xA8 */ 0x0909, 0x090A, 0x090B, 0x090E, 0x090F, 0x0910, 0x090D, 0x0912,
    /* 0xB0 */ 0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919,
    /* 0xB8 */ 0x091A, 0x091B, 0x091C, 0x091D, 0x091E, 0x091F, 0x0920, 0x0921,
    /* 0xC0 */ 0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929,
    /* 0xC8 */ 0x092A, 0x092B, 0x092C, 0x092D, 0x092E, 0x092F, 0x095F, 0x0930,
    /* 0xD0 */ 0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938,
    /* 0xD8 */ 0x0939, 0xFFFF, 0x093E, 0x093F, 0x0940, 0x0941, 0x0942, 0x0943,
    /* 0xE0 */ 0x0946, 0x0947, 0x0948, 0x0945, 0x094A, 0x094B, 0x094C, 0x0949,
    /* 0xE8 */ 0x094D, 0x093C, 0x0964, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    /* 0xF0 */ 0xFFFF, 0x0966, 0x0967, 0x0968, 0x0969, 0x096A, 0x096B, 0x096C,
    /* 0xF8 */ 0x096D, 0x096E, 0x096F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
};
/**
 * Convert a buffer of ISCII Devanagari bytes to UTF-16 code units.
 *
 * @param pSrcBuf       source bytes
 * @param nSrcBytes     number of bytes available in pSrcBuf
 * @param pDestBuf      destination buffer
 * @param nDestChars    capacity of pDestBuf in sal_Unicode units
 * @param nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; forwarded to the
 *                      bad-input handler and checked for FLUSH on stop
 * @param pInfo         if non-null, receives the RTL_TEXTTOUNICODE_INFO_* bits
 * @param pSrcCvtBytes  if non-null, receives the number of source bytes consumed
 * @return number of sal_Unicode units written to pDestBuf
 *
 * m_cPrevChar carries the last table-mapped byte across iterations (and
 * across calls) so that the halant pairs below can be recognised.
 */
sal_Size IsciiDevanagariToUnicode::convert(
    char const* pSrcBuf, sal_Size nSrcBytes,
    sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
    sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
{
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    sal_Unicode* pDestBufPtr = pDestBuf;
    sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
    sal_Size startOfCurrentChar = 0;

    while (nConverted < nSrcBytes)
    {
        if (pDestBufPtr == pDestBufEnd)
        {
            nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
            break;
        }

        sal_Unicode cChar = sal_Unicode();
        sal_uInt8 nIn = static_cast<sal_uInt8>(pSrcBuf[nConverted]);
        // Peek at the following byte only when it lies inside the source
        // buffer; on the last byte there is nothing to look at, so use 0.
        // (The loop condition guarantees nConverted + 1 cannot overflow.)
        sal_uInt8 nNext = nConverted + 1 < nSrcBytes
            ? static_cast<sal_uInt8>(pSrcBuf[nConverted + 1])
            : 0;

        bool bNormal = true;  // map nIn through IsciiDevanagariMap
        bool bDouble = false; // nIn and nNext together form one character

        //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
        //halant + nukta  E8 E9 -> halant + ZWJ  094D 200D
        if (m_cPrevChar == 0xE8 && nIn == 0xE8)
        {
            cChar = 0x200C;
            bNormal = false;
        }
        else if (m_cPrevChar == 0xE8 && nIn == 0xE9)
        {
            cChar = 0x200D;
            bNormal = false;
        }
        else if (nNext == 0xE9)
        {
            // Certain bytes followed by 0xE9 form a two-byte sequence that
            // maps to a single code point.
            bNormal = false;
            bDouble = true;
            switch (nIn)
            {
                case 0xA1:
                    cChar = 0x0950;
                    break;
                case 0xA6:
                    cChar = 0x090C;
                    break;
                case 0xA7:
                    cChar = 0x0961;
                    break;
                case 0xAA:
                    cChar = 0x0960;
                    break;
                case 0xB3:
                    cChar = 0x0958;
                    break;
                case 0xB4:
                    cChar = 0x0959;
                    break;
                case 0xB5:
                    cChar = 0x095A;
                    break;
                case 0xBA:
                    cChar = 0x095B;
                    break;
                case 0xBF:
                    cChar = 0x095C;
                    break;
                case 0xC0:
                    cChar = 0x095D;
                    break;
                case 0xC9:
                    cChar = 0x095E;
                    break;
                case 0xDB:
                    cChar = 0x0962;
                    break;
                case 0xDC:
                    cChar = 0x0963;
                    break;
                case 0xDF:
                    cChar = 0x0944;
                    break;
                case 0xEA:
                    cChar = 0x093D;
                    break;
                default:
                    // Not a known combination: treat nIn as a plain byte and
                    // leave the 0xE9 for the next iteration.
                    bNormal = true;
                    bDouble = false;
                    break;
            }
        }

        ++nConverted;
        if (bDouble)
            ++nConverted;

        if (bNormal)
            cChar = IsciiDevanagariMap[nIn];

        bool bUndefined = cChar == 0xFFFF;
        if (bUndefined)
        {
            BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
                bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
                &nInfo);
            if (eAction == BAD_INPUT_CONTINUE)
            {
                startOfCurrentChar = nConverted;
                continue;
            }
            if (eAction == BAD_INPUT_STOP)
            {
                // With FLUSH set, report the offending character as not
                // consumed by rewinding to where it started.
                if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0)
                {
                    nConverted = startOfCurrentChar;
                }
                break;
            }
            assert(eAction == BAD_INPUT_NO_OUTPUT);
            nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
            break;
        }

        *pDestBufPtr++ = cChar;
        m_cPrevChar = bNormal ? nIn : 0;
        startOfCurrentChar = nConverted;
    }

    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtBytes)
        *pSrcCvtBytes = nConverted;

    return pDestBufPtr - pDestBuf;
}
// Contiguous Unicode ranges that map one-to-one onto ISCII Devanagari bytes.
// Each entry is written as { first code point, last - first, first byte };
// UnicodeToIsciiDevanagari::convert() reads them through the members
// unicode, range and byte (the struct itself is declared in an included
// header, so the member order is assumed to match -- TODO confirm).
// The entries must stay sorted by ascending first code point because the
// lookup stops as soon as it meets a range that starts beyond the character.
BmpUnicodeToSingleByteRange const unicodeToISCIIEncoding[] =
{
    { 0x0000, 0x007F - 0x0000, 0x00 },
    { 0x0901, 0x0903 - 0x0901, 0xA1 },
    { 0x0905, 0x090B - 0x0905, 0xA4 },
    { 0x090D, 0x090D - 0x090D, 0xAE },
    { 0x090E, 0x0910 - 0x090E, 0xAB },
    { 0x0911, 0x0911 - 0x0911, 0xB2 },
    { 0x0912, 0x0914 - 0x0912, 0xAF },
    { 0x0915, 0x092F - 0x0915, 0xB3 },
    { 0x0930, 0x0939 - 0x0930, 0xCF },
    { 0x093C, 0x093C - 0x093C, 0xE9 },
    { 0x093E, 0x0943 - 0x093E, 0xDA },
    { 0x0945, 0x0945 - 0x0945, 0xE3 },
    { 0x0946, 0x0948 - 0x0946, 0xE0 },
    { 0x0949, 0x0949 - 0x0949, 0xE7 },
    { 0x094A, 0x094C - 0x094A, 0xE4 },
    { 0x094D, 0x094D - 0x094D, 0xE8 },
    { 0x095F, 0x095F - 0x095F, 0xCE },
    { 0x0964, 0x0964 - 0x0964, 0xEA },
    { 0x0966, 0x096F - 0x0966, 0xF1 }
};
/**
 * Convert a buffer of UTF-16 code units to ISCII Devanagari bytes.
 *
 * @param pSrcBuf       source UTF-16 code units
 * @param nSrcChars     number of code units available in pSrcBuf
 * @param pDestBuf      destination byte buffer
 * @param nDestBytes    capacity of pDestBuf in bytes
 * @param nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; forwarded to the
 *                      bad-input handler and checked for FLUSH at the end
 * @param pInfo         if non-null, receives the RTL_UNICODETOTEXT_INFO_* bits
 * @param pSrcCvtChars  if non-null, receives the number of code units consumed
 * @return number of bytes written to pDestBuf
 *
 * A high surrogate at the end of the input is kept in m_cHighSurrogate for
 * the next call; m_cPrevChar remembers the last range-mapped character so
 * that ZWNJ/ZWJ following a halant can be encoded.
 */
sal_Size UnicodeToIsciiDevanagari::convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
    char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
    sal_uInt32* pInfo, sal_Size* pSrcCvtChars)
{
    size_t const entries = SAL_N_ELEMENTS(unicodeToISCIIEncoding);
    BmpUnicodeToSingleByteRange const* ranges = unicodeToISCIIEncoding;
    sal_Unicode cHighSurrogate = m_cHighSurrogate;
    sal_uInt32 nInfo = 0;
    sal_Size nConverted = 0;
    char* pDestBufPtr = pDestBuf;
    char* pDestBufEnd = pDestBuf + nDestBytes;

    // NOTE: the body uses goto into the labels below, so no initialised
    // local may be declared between the first goto and those labels.
    for (; nConverted < nSrcChars; ++nConverted)
    {
        // Passed to the bad-input handler: stays true for a code point that
        // simply has no mapping, cleared for malformed surrogate input.
        bool bUndefined = true;
        sal_uInt32 c = *pSrcBuf++;
        char cSpecialChar = 0;

        if (cHighSurrogate == 0)
        {
            if (rtl::isHighSurrogate(c))
            {
                // Remember it and wait for the matching low surrogate.
                cHighSurrogate = static_cast<sal_Unicode>(c);
                continue;
            }
            else if (rtl::isLowSurrogate(c))
            {
                // Low surrogate without a preceding high surrogate.
                bUndefined = false;
                goto bad_input;
            }
        }
        else if (rtl::isLowSurrogate(c))
        {
            c = rtl::combineSurrogates(cHighSurrogate, c);
        }
        else
        {
            // Pending high surrogate not followed by a low surrogate.
            bUndefined = false;
            goto bad_input;
        }

        assert(rtl::isUnicodeScalarValue(c));

        //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
        //halant + nukta  E8 E9 -> halant + ZWJ  094D 200D
        if (m_cPrevChar == 0x094D && c == 0x200C)
            cSpecialChar = '\xE8';
        else if (m_cPrevChar == 0x094D && c == 0x200D)
            cSpecialChar = '\xE9';
        if (cSpecialChar)
        {
            if (pDestBufEnd - pDestBufPtr < 1)
            {
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
            *pDestBufPtr++ = cSpecialChar;
            m_cPrevChar = 0;
            goto done;
        }

        // Code points that are encoded as a base byte followed by 0xE9.
        switch (c)
        {
            case 0x0950:
                cSpecialChar = '\xA1';
                break;
            case 0x090C:
                cSpecialChar = '\xA6';
                break;
            case 0x0961:
                cSpecialChar = '\xA7';
                break;
            case 0x0960:
                cSpecialChar = '\xAA';
                break;
            case 0x0958:
                cSpecialChar = '\xB3';
                break;
            case 0x0959:
                cSpecialChar = '\xB4';
                break;
            case 0x095A:
                cSpecialChar = '\xB5';
                break;
            case 0x095B:
                cSpecialChar = '\xBA';
                break;
            case 0x095C:
                cSpecialChar = '\xBF';
                break;
            case 0x095D:
                cSpecialChar = '\xC0';
                break;
            case 0x095E:
                cSpecialChar = '\xC9';
                break;
            case 0x0962:
                cSpecialChar = '\xDB';
                break;
            case 0x0963:
                cSpecialChar = '\xDC';
                break;
            case 0x0944:
                cSpecialChar = '\xDF';
                break;
            case 0x093D:
                cSpecialChar = '\xEA';
                break;
            default:
                break;
        }
        if (cSpecialChar)
        {
            // Both bytes must fit; nothing is written otherwise.
            if (pDestBufEnd - pDestBufPtr < 2)
            {
                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                break;
            }
            *pDestBufPtr++ = cSpecialChar;
            *pDestBufPtr++ = '\xE9';
            m_cPrevChar = 0;
            goto done;
        }

        // Linearly searching through the ranges is probably fastest, assuming
        // that most converted characters belong to the ASCII subset:
        for (size_t i = 0; i < entries; ++i)
        {
            if (c < ranges[i].unicode)
            {
                // Ranges are sorted, so no later range can contain c.
                break;
            }
            if (c <= sal::static_int_cast<sal_uInt32>(
                         ranges[i].unicode + ranges[i].range))
            {
                if (pDestBufEnd - pDestBufPtr < 1)
                {
                    goto no_output;
                }
                *pDestBufPtr++ = static_cast<char>(
                    ranges[i].byte + (c - ranges[i].unicode));
                m_cPrevChar = c;
                goto done;
            }
        }
        goto bad_input;

    done:
        cHighSurrogate = 0;
        continue;

    bad_input:
        switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
                    bUndefined, c, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
                    0, nullptr))
        {
            case sal::detail::textenc::BAD_INPUT_STOP:
                cHighSurrogate = 0;
                break; // leaves the switch; the break below leaves the loop
            case sal::detail::textenc::BAD_INPUT_CONTINUE:
                cHighSurrogate = 0;
                continue;
            case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
                goto no_output;
        }
        break;

    no_output:
        // The current character is not counted in nConverted because the
        // loop is left before its increment runs.
        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
        break;
    }

    // Input ended (without error) while a high surrogate is still pending.
    if (cHighSurrogate != 0
        && ((nInfo
             & (RTL_UNICODETOTEXT_INFO_ERROR
                | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
            == 0))
    {
        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
        {
            nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
        }
        else
        {
            switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
                        false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
                        0, nullptr))
            {
                case sal::detail::textenc::BAD_INPUT_STOP:
                case sal::detail::textenc::BAD_INPUT_CONTINUE:
                    cHighSurrogate = 0;
                    break;
                case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
                    nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
                    break;
            }
        }
    }

    m_cHighSurrogate = cHighSurrogate;
    if (pInfo)
        *pInfo = nInfo;
    if (pSrcCvtChars)
        *pSrcCvtChars = nConverted;

    return pDestBufPtr - pDestBuf;
}
// Free-function entry point for ISCII Devanagari -> Unicode conversion.
// The first (unnamed) argument is ignored; pContext is cast to the
// IsciiDevanagariToUnicode state object and all remaining arguments are
// forwarded unchanged to its convert() member, whose result is returned.
sal_Size ImplConvertIsciiDevanagariToUnicode(void const*,
    void* pContext, char const* pSrcBuf, sal_Size nSrcBytes,
    sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
    sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
{
    return static_cast<IsciiDevanagariToUnicode*>(pContext)->convert(
        pSrcBuf, nSrcBytes, pDestBuf, nDestChars, nFlags, pInfo, pSrcCvtBytes);
}
// Free-function entry point for Unicode -> ISCII Devanagari conversion.
// The first (unnamed) argument is ignored; pContext is cast to the
// UnicodeToIsciiDevanagari state object and all remaining arguments are
// forwarded unchanged to its convert() member, whose result is returned.
sal_Size ImplConvertUnicodeToIsciiDevanagari(void const*,
    void* pContext, sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
    char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
    sal_uInt32* pInfo, sal_Size* pSrcCvtChars)
{
    return static_cast<UnicodeToIsciiDevanagari*>(pContext)->convert(
        pSrcBuf, nSrcChars, pDestBuf, nDestBytes, nFlags, pInfo, pSrcCvtChars);
}
void *ImplCreateIsciiDevanagariToUnicodeContext()
{
return new IsciiDevanagariToUnicode;
}
// Delete the conversion state behind pContext, treating it as an
// IsciiDevanagariToUnicode object.
void ImplDestroyIsciiDevanagariToUnicodeContext(void* pContext)
{
    delete static_cast<IsciiDevanagariToUnicode*>(pContext);
}
void ImplResetIsciiDevanagariToUnicodeContext(void * pContext)
{
IsciiDevanagariToUnicode *pCtx =
static_cast <IsciiDevanagariToUnicode*>(pContext);
pCtx->reset();
}
void *ImplCreateUnicodeToIsciiDevanagariContext()
{
return new UnicodeToIsciiDevanagari;
}
void ImplResetUnicodeToIsciiDevanagariContext(void * pContext)
{
UnicodeToIsciiDevanagari *pCtx =
static_cast <UnicodeToIsciiDevanagari*>(pContext);
pCtx->reset();
}
// Delete the conversion state behind pContext, treating it as a
// UnicodeToIsciiDevanagari object.
void ImplDestroyUnicodeToIsciiDevanagariContext(void* pContext)
{
    delete static_cast<UnicodeToIsciiDevanagari*>(pContext);
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
/*
 * Measurement V0.5 in percent: C=99 H=90 G=94
 * Processing time: 0.14 seconds (preprocessed on 2026-06-04)
 * © Formatika GbR, Germany
 */