/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
char cEncodedChar = static_cast<char>(cChar);
cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar(); if( 0U == cChar )
{ // If the character could not be // converted, because a conversion is not // available, do no conversion at all.
cChar = cEncodedChar;
}
}
} else
nNextCh = 0U;
if (cChar == 1 || cChar == 2)
{ if( '>' == cBreak )
{ // When reading the content of a tag we have // to change it to ' ' or '-' if( 1U == cChar )
cChar = ' '; else//2U
cChar = '-';
} else
{ // If not scanning a tag return token
aToken.append( sTmpBuffer );
sTmpBuffer.setLength(0);
// Hack: _GetNextChar shall not read the // next character if( ';' != nNextCh )
aToken.append( " " ); if( 1U == cChar ) return HtmlTokenId::NONBREAKSPACE; else//2U return HtmlTokenId::SOFTHYPH;
}
}
} else
nNextCh = 0U;
} // &{...};-JavaScript-Macros are not supported any longer. elseif( IsParserWorking() )
{
sTmpBuffer.append( '&' );
bNextCh = false; break;
}
bNextCh = (';' == nNextCh); if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
cChar=='\"' || cChar==' ') )
{ // ' and " have to be escaped within tags to separate // them from ' and " enclosing options. // \ has to be escaped as well. // Space is protected because it's not a delimiter between // options.
sTmpBuffer.append( '\\' );
} if( IsParserWorking() )
{ if( cChar )
sTmpBuffer.appendUtf32( cChar );
} elseif( SvParserState::Pending==eState && '>'!=cBreak )
{ // Restart with '&', the remainder is returned as // text token. if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
{ // _GetNextChar() returns the previous text and // during the next execution a new character is read. // Thus we have to position in front of the '&'.
nNextCh = 0U;
rInput.Seek( nStreamPos - GetCharSize() );
nlLinePos = nLinePos-1;
ClearTxtConvContext();
bReadNextChar = true;
}
bNextCh = false;
}
} break; case'=': if( '>'==cBreak && !cQuote )
bEqSignFound = true;
sTmpBuffer.appendUtf32( nNextCh ); break;
case'\\': if( '>'==cBreak )
{ // mark within tags
sTmpBuffer.append( '\\' );
}
sTmpBuffer.append( '\\' ); break;
case sal_Unicode(EOF): if( rInput.eof() )
{
bContinue = false;
} // else: ignore, not a valid code point break;
case'<':
bEqSignFound = false; if( '>'==cBreak )
sTmpBuffer.appendUtf32( nNextCh ); else
bContinue = false; // break, string is together break;
case'\f': if( '>' == cBreak )
{ // If scanning options treat it like a space, ...
sTmpBuffer.append( ' ' );
} else
{ // otherwise it's a separate token.
bContinue = false;
} break;
case'\r': case'\n': if( '>'==cBreak )
{ // cr/lf in tag is handled in GetNextToken_()
sTmpBuffer.appendUtf32( nNextCh ); break;
} elseif( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
{
bContinue = false; break;
} // Reduce sequence of CR/LF/BLANK/TAB to a single blank
[[fallthrough]]; case'\t': if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
{ // Pass Tabs up in <PRE>
bContinue = false; break;
}
[[fallthrough]]; case'\x0b': if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) && '>'!=cBreak )
{ break;
} if (!m_bPreserveSpaces)
nNextCh = ' ';
[[fallthrough]]; case' ': if (!m_bPreserveSpaces)
{
sTmpBuffer.appendUtf32(nNextCh); if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
{ // Reduce sequences of Blanks/Tabs/CR/LF to a single blank do
{
nNextCh = GetNextChar(); if (sal_Unicode(EOF) == nNextCh && rInput.eof())
{ if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
{ // Have seen s.th. aside from blanks?
aToken.append(sTmpBuffer);
sTmpBuffer.setLength(0); return HtmlTokenId::TEXTTOKEN;
} else // Only read blanks: no text must be returned // and GetNextToken_ has to read until EOF return HtmlTokenId::NONE;
}
} while (HTML_ISSPACE(nNextCh));
bNextCh = false;
} break;
}
[[fallthrough]]; default:
bEqSignFound = false; if (nNextCh == cBreak && !cQuote)
bContinue = false; else
{ do { if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
{ // All remaining characters make their way into the text.
sTmpBuffer.appendUtf32( nNextCh );
}
if( bEndTokenFound )
{ // During the last execution we already found the end token, // thus we don't have to search it again.
bReadScript = false;
bReadStyle = false;
aEndToken.clear();
bEndTokenFound = false;
// If it's a token which can be switched off... if( bOffState )
{ if( nRet >= HtmlTokenId::ONOFF_START )
{ // and there is an off token, return off token instead
nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
} elseif( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
{ // and there is no off token, return unknown token. // (except for </BR>, that is treated like <BR>) // No exception for XHTML, though.
nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
}
}
if( nRet == HtmlTokenId::COMMENT )
{ // fix: due to being case sensitive use sSaveToken as start of comment // and append a blank.
aToken = sSaveToken; if( '>'!=nNextCh )
aToken.append( " " );
sal_uInt64 nCStreamPos = 0;
sal_uInt32 nCLineNr = 0;
sal_uInt32 nCLinePos = 0;
sal_Int32 nCStrLen = 0;
// fdo#34666 fdo#36080 fdo#36390: closing "/>"?: // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF // which lead to fdo#56772. if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
{
mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
aToken.setLength( aToken.getLength()-1 ); // remove trailing '/'
} if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
{ // Move back in front of < and restart there. // Return < as text.
rInput.Seek( nStreamPos );
SetLineNr( nLineNr );
SetLinePos( nLinePos );
ClearTxtConvContext();
const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
{ // If the options for the current token have already been returned, // return them once again. if (!maOptions.empty()) return maOptions;
// Actually only certain characters allowed. // Netscape only looks for "=" and white space (c.f. // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c) while( nPos < aToken.getLength() )
{
cChar = aToken[nPos]; if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) ) break;
nPos++;
}
// Token is known and can be saved
maOptions.emplace_back(nToken, sName, aValue);
} else // Ignore white space and unexpected characters
nPos++;
}
return maOptions;
}
/**
 * Post-processes a token while the parser is in its PRE filtering mode.
 *
 * Besides mapping the token, this keeps the running column counter
 * nPre_LinePos up to date: it is reset on line/paragraph breaks, advanced
 * by the length of text tokens, and used to expand a tab into the number
 * of blanks needed to reach the next multiple of eight.
 *
 * @param nToken  the token that was just read
 * @return the token to hand on: unchanged for the tokens listed below,
 *         TEXTTOKEN for an expanded tab, LINEBREAK/NONE for breaks, and
 *         UNKNOWNCONTROL_ON / UNKNOWNCONTROL_OFF for everything else
 *         (NONE itself stays NONE).
 *
 * bPre_IgnoreNewPara is cleared on every call, whatever the token was.
 */
HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
{
    switch( nToken )
    {
        // A paragraph start is downgraded to a plain line break (in Netscape
        // these only have an effect in non-empty paragraphs). Every kind of
        // break restarts the column count and is swallowed completely while
        // bPre_IgnoreNewPara is set.
        case HtmlTokenId::PARABREAK_ON:
        case HtmlTokenId::LINEBREAK:
        case HtmlTokenId::NEWPARA:
            nPre_LinePos = 0;
            if( bPre_IgnoreNewPara )
                nToken = HtmlTokenId::NONE;
            else if( HtmlTokenId::PARABREAK_ON == nToken )
                nToken = HtmlTokenId::LINEBREAK;
            break;

        // A tab becomes a text token consisting of blanks up to the next
        // column that is a multiple of eight.
        case HtmlTokenId::TABCHAR:
        {
            sal_Int32 nFillWidth = 8 - (nPre_LinePos % 8);
            DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
            if( nFillWidth > aToken.getLength() )
            {
                OUStringBuffer aPadded(aToken);
                aToken = comphelper::string::padToLength(aPadded, nFillWidth, ' ');
            }
            nPre_LinePos += nFillWidth;
            nToken = HtmlTokenId::TEXTTOKEN;
            break;
        }

        // Text is kept; it only moves the column counter forward.
        case HtmlTokenId::TEXTTOKEN:
            nPre_LinePos += aToken.getLength();
            break;

        // All of the following tokens are passed through untouched.
        case HtmlTokenId::SELECT_ON:
        case HtmlTokenId::SELECT_OFF:
        case HtmlTokenId::BODY_ON:
        case HtmlTokenId::FORM_ON:
        case HtmlTokenId::FORM_OFF:
        case HtmlTokenId::INPUT:
        case HtmlTokenId::OPTION:
        case HtmlTokenId::TEXTAREA_ON:
        case HtmlTokenId::TEXTAREA_OFF:

        case HtmlTokenId::IMAGE:
        case HtmlTokenId::APPLET_ON:
        case HtmlTokenId::APPLET_OFF:
        case HtmlTokenId::PARAM:
        case HtmlTokenId::EMBED:

        case HtmlTokenId::HEAD1_ON:
        case HtmlTokenId::HEAD1_OFF:
        case HtmlTokenId::HEAD2_ON:
        case HtmlTokenId::HEAD2_OFF:
        case HtmlTokenId::HEAD3_ON:
        case HtmlTokenId::HEAD3_OFF:
        case HtmlTokenId::HEAD4_ON:
        case HtmlTokenId::HEAD4_OFF:
        case HtmlTokenId::HEAD5_ON:
        case HtmlTokenId::HEAD5_OFF:
        case HtmlTokenId::HEAD6_ON:
        case HtmlTokenId::HEAD6_OFF:
        case HtmlTokenId::BLOCKQUOTE_ON:
        case HtmlTokenId::BLOCKQUOTE_OFF:
        case HtmlTokenId::ADDRESS_ON:
        case HtmlTokenId::ADDRESS_OFF:
        case HtmlTokenId::HORZRULE:

        case HtmlTokenId::CENTER_ON:
        case HtmlTokenId::CENTER_OFF:
        case HtmlTokenId::DIVISION_ON:
        case HtmlTokenId::DIVISION_OFF:

        case HtmlTokenId::SCRIPT_ON:
        case HtmlTokenId::SCRIPT_OFF:
        case HtmlTokenId::RAWDATA:

        case HtmlTokenId::TABLE_ON:
        case HtmlTokenId::TABLE_OFF:
        case HtmlTokenId::CAPTION_ON:
        case HtmlTokenId::CAPTION_OFF:
        case HtmlTokenId::COLGROUP_ON:
        case HtmlTokenId::COLGROUP_OFF:
        case HtmlTokenId::COL_ON:
        case HtmlTokenId::COL_OFF:
        case HtmlTokenId::THEAD_ON:
        case HtmlTokenId::THEAD_OFF:
        case HtmlTokenId::TFOOT_ON:
        case HtmlTokenId::TFOOT_OFF:
        case HtmlTokenId::TBODY_ON:
        case HtmlTokenId::TBODY_OFF:
        case HtmlTokenId::TABLEROW_ON:
        case HtmlTokenId::TABLEROW_OFF:
        case HtmlTokenId::TABLEDATA_ON:
        case HtmlTokenId::TABLEDATA_OFF:
        case HtmlTokenId::TABLEHEADER_ON:
        case HtmlTokenId::TABLEHEADER_OFF:

        case HtmlTokenId::ANCHOR_ON:
        case HtmlTokenId::ANCHOR_OFF:
        case HtmlTokenId::BOLD_ON:
        case HtmlTokenId::BOLD_OFF:
        case HtmlTokenId::ITALIC_ON:
        case HtmlTokenId::ITALIC_OFF:
        case HtmlTokenId::STRIKE_ON:
        case HtmlTokenId::STRIKE_OFF:
        case HtmlTokenId::STRIKETHROUGH_ON:
        case HtmlTokenId::STRIKETHROUGH_OFF:
        case HtmlTokenId::UNDERLINE_ON:
        case HtmlTokenId::UNDERLINE_OFF:
        case HtmlTokenId::BASEFONT_ON:
        case HtmlTokenId::BASEFONT_OFF:
        case HtmlTokenId::FONT_ON:
        case HtmlTokenId::FONT_OFF:
        case HtmlTokenId::BLINK_ON:
        case HtmlTokenId::BLINK_OFF:
        case HtmlTokenId::SPAN_ON:
        case HtmlTokenId::SPAN_OFF:
        case HtmlTokenId::SUBSCRIPT_ON:
        case HtmlTokenId::SUBSCRIPT_OFF:
        case HtmlTokenId::SUPERSCRIPT_ON:
        case HtmlTokenId::SUPERSCRIPT_OFF:
        case HtmlTokenId::BIGPRINT_ON:
        case HtmlTokenId::BIGPRINT_OFF:
        case HtmlTokenId::SMALLPRINT_OFF:
        case HtmlTokenId::SMALLPRINT_ON:

        case HtmlTokenId::EMPHASIS_ON:
        case HtmlTokenId::EMPHASIS_OFF:
        case HtmlTokenId::CITATION_ON:
        case HtmlTokenId::CITATION_OFF:
        case HtmlTokenId::STRONG_ON:
        case HtmlTokenId::STRONG_OFF:
        case HtmlTokenId::CODE_ON:
        case HtmlTokenId::CODE_OFF:
        case HtmlTokenId::SAMPLE_ON:
        case HtmlTokenId::SAMPLE_OFF:
        case HtmlTokenId::KEYBOARD_ON:
        case HtmlTokenId::KEYBOARD_OFF:
        case HtmlTokenId::VARIABLE_ON:
        case HtmlTokenId::VARIABLE_OFF:
        case HtmlTokenId::DEFINSTANCE_ON:
        case HtmlTokenId::DEFINSTANCE_OFF:
        case HtmlTokenId::SHORTQUOTE_ON:
        case HtmlTokenId::SHORTQUOTE_OFF:
        case HtmlTokenId::LANGUAGE_ON:
        case HtmlTokenId::LANGUAGE_OFF:
        case HtmlTokenId::AUTHOR_ON:
        case HtmlTokenId::AUTHOR_OFF:
        case HtmlTokenId::PERSON_ON:
        case HtmlTokenId::PERSON_OFF:
        case HtmlTokenId::ACRONYM_ON:
        case HtmlTokenId::ACRONYM_OFF:
        case HtmlTokenId::ABBREVIATION_ON:
        case HtmlTokenId::ABBREVIATION_OFF:
        case HtmlTokenId::INSERTEDTEXT_ON:
        case HtmlTokenId::INSERTEDTEXT_OFF:
        case HtmlTokenId::DELETEDTEXT_ON:
        case HtmlTokenId::DELETEDTEXT_OFF:
        case HtmlTokenId::TELETYPE_ON:
        case HtmlTokenId::TELETYPE_OFF:
            break;

        // Anything else is reported as an unknown control token; NONE is
        // the only value that is left alone here.
        default:
            if( HtmlTokenId::NONE == nToken )
                break;
            if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
                nToken = HtmlTokenId::UNKNOWNCONTROL_OFF;
            else
                nToken = HtmlTokenId::UNKNOWNCONTROL_ON;
            break;
    }

    // The "ignore next new paragraph" request is consumed by this call.
    bPre_IgnoreNewPara = false;

    return nToken;
}
HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
{ switch( nToken )
{ case HtmlTokenId::NEWPARA: if( bPre_IgnoreNewPara )
nToken = HtmlTokenId::NONE;
[[fallthrough]]; case HtmlTokenId::TEXTTOKEN: case HtmlTokenId::NONBREAKSPACE: case HtmlTokenId::SOFTHYPH: break; // kept
for ( size_t i = aOptions.size(); i; )
{ const HTMLOption& aOption = aOptions[--i]; switch ( aOption.GetToken() )
{ case HtmlOptionId::NAME:
aName = aOption.GetString(); if ( HtmlMeta::NONE==nAction )
{
aOption.GetEnum( nAction, aHTMLMetaNameTable );
} break; case HtmlOptionId::HTTPEQUIV:
aName = aOption.GetString();
aOption.GetEnum( nAction, aHTMLMetaNameTable );
bHTTPEquiv = true; break; case HtmlOptionId::CONTENT:
aContent = aOption.GetString(); break; case HtmlOptionId::CHARSET:
{
OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr())); break;
} default: break;
}
}
if ( bHTTPEquiv || HtmlMeta::Description != nAction )
{ // if it is not a Description, remove CRs and LFs from CONTENT
aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
} else
{ // convert line endings for Description
aContent = convertLineEnd(aContent, GetSystemLineEnd());
}
if ( bHTTPEquiv && i_pHTTPHeader )
{ // Netscape seems to just ignore a closing ", so we do too if ( aContent.endsWith("\"") )
{
aContent = aContent.copy( 0, aContent.getLength() - 1 );
}
SvKeyValue aKeyValue( aName, aContent );
i_pHTTPHeader->Append( aKeyValue );
}
switch ( nAction )
{ case HtmlMeta::Author: if (i_xDocProps.is()) {
i_xDocProps->setAuthor( aContent );
bChanged = true;
} break; case HtmlMeta::Description: if (i_xDocProps.is()) {
i_xDocProps->setDescription( aContent );
bChanged = true;
} break; case HtmlMeta::Keywords: if (i_xDocProps.is()) {
i_xDocProps->setKeywords(
::comphelper::string::convertCommaSeparated(aContent));
bChanged = true;
} break; case HtmlMeta::Classification: if (i_xDocProps.is()) {
i_xDocProps->setSubject( aContent );
bChanged = true;
} break;
case HtmlMeta::ChangedBy: if (i_xDocProps.is()) {
i_xDocProps->setModifiedBy( aContent );
bChanged = true;
} break;
// If the encoding is set by a META tag, it may only overwrite the // current encoding if both, the current and the new encoding, are 1-sal_uInt8 // encodings. Everything else cannot lead to reasonable results. if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
rtl_isOctetTextEncoding( eEnc ) &&
rtl_isOctetTextEncoding( GetSrcEncoding() ) )
{
eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
SetSrcEncoding( eEnc );
}
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.33Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-05-08)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.