/** * A class to index a float array as a 1D array without owning the pointer or * copying the data.
*/ class ConstArray1D : public ReadArray1D { public:
// Default-construct an empty view: null data pointer and zero length.
ConstArray1D()
    : data_(nullptr),
      d1_(0) {}
// Init the object, the object does not own the data nor copy. // It is designed to directly use data from memory mapped resources. void init(const int32_t* data, int32_t d1) {
U_ASSERT(IEEE_754 == 1);
data_ = reinterpret_cast<constfloat*>(data);
d1_ = d1;
}
/** * A class to index a float array as a 2D array without owning the pointer or * copying the data.
*/ class ConstArray2D : public ReadArray2D { public:
// Default-construct an empty view: null data pointer and both dimensions zero.
ConstArray2D()
    : data_(nullptr),
      d1_(0),
      d2_(0) {}
// Init the object, the object does not own the data nor copy. // It is designed to directly use data from memory mapped resources. void init(const int32_t* data, int32_t d1, int32_t d2) {
U_ASSERT(IEEE_754 == 1);
data_ = reinterpret_cast<constfloat*>(data);
d1_ = d1;
d2_ = d2;
}
/** * A class to allocate data as a writable 1D array. * This is the main class implementing the matrix operations.
*/ class Array1D : public ReadArray1D { public:
// Default-construct an empty array: no owned memory, no data, zero length.
Array1D()
    : memory_(nullptr),
      data_(nullptr),
      d1_(0) {}
Array1D(int32_t d1, UErrorCode &status)
: memory_(uprv_malloc(d1 * sizeof(float))),
data_(static_cast<float*>(memory_)), d1_(d1) { if (U_SUCCESS(status)) { if (memory_ == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR; return;
}
clear();
}
}
virtual ~Array1D();
// Special constructor for a writable array that does not own its memory,
// e.g. a slice of another array: memory_ stays null and data_ simply
// points at the caller-provided storage of d1 floats.
Array1D(float* data, int32_t d1)
    : memory_(nullptr),
      data_(data),
      d1_(d1) {}
// Return the index which point to the max data in the array. inline int32_t maxIndex() const {
int32_t index = 0; float max = data_[0]; for (int32_t i = 1; i < d1_; i++) { if (data_[i] > max) {
max = data_[i];
index = i;
}
} return index;
}
// Slice part of the array to a new one. inline Array1D slice(int32_t from, int32_t size) const {
U_ASSERT(from >= 0);
U_ASSERT(from < d1_);
U_ASSERT(from + size <= d1_); return Array1D(data_ + from, size);
}
// Add dot product of a 1D array and a 2D array into this one. inline Array1D& addDotProduct(const ReadArray1D& a, const ReadArray2D& b) {
U_ASSERT(a.d1() == b.d1());
U_ASSERT(b.d2() == d1()); for (int32_t i = 0; i < d1(); i++) { for (int32_t j = 0; j < a.d1(); j++) {
data_[i] += a.get(j) * b.get(j, i);
}
} return *this;
}
// Hadamard Product the values of another array of the same size into this one. inline Array1D& hadamardProduct(const ReadArray1D& a) {
U_ASSERT(a.d1() == d1()); for (int32_t i = 0; i < d1(); i++) {
data_[i] *= a.get(i);
} return *this;
}
// Add the Hadamard Product of two arrays of the same size into this one. inline Array1D& addHadamardProduct(const ReadArray1D& a, const ReadArray1D& b) {
U_ASSERT(a.d1() == d1());
U_ASSERT(b.d1() == d1()); for (int32_t i = 0; i < d1(); i++) {
data_[i] += a.get(i) * b.get(i);
} return *this;
}
// Add the values of another array of the same size into this one. inline Array1D& add(const ReadArray1D& a) {
U_ASSERT(a.d1() == d1()); for (int32_t i = 0; i < d1(); i++) {
data_[i] += a.get(i);
} return *this;
}
// Assign the values of another array of the same size into this one. inline Array1D& assign(const ReadArray1D& a) {
U_ASSERT(a.d1() == d1()); for (int32_t i = 0; i < d1(); i++) {
data_[i] = a.get(i);
} return *this;
}
// Apply tanh to all the elements in the array. inline Array1D& tanh() { return tanh(*this);
}
// Apply tanh of a and store into this array. inline Array1D& tanh(const Array1D& a) {
U_ASSERT(a.d1() == d1()); for (int32_t i = 0; i < d1_; i++) {
data_[i] = std::tanh(a.get(i));
} return *this;
}
// Apply sigmoid to all the elements in the array. inline Array1D& sigmoid() { for (int32_t i = 0; i < d1_; i++) {
data_[i] = 1.0f/(1.0f + expf(-data_[i]));
} return *this;
}
// Walk the character (grapheme cluster) boundaries of `text` between
// startPos and endPos. For every cluster found, the cluster's start offset
// is appended to `offsets` and stringToIndex() of the extracted cluster
// text is appended to `indices`.
//
// The function returns as soon as `status` reports a failure; whatever was
// appended before that point stays in the vectors.
void GraphemeClusterVectorizer::vectorize(
    UText *text, int32_t startPos, int32_t endPos,
    UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return;
    }

    // Reserve room for one entry per code unit of the range up front.
    const int32_t rangeLength = endPos - startPos;
    if (!offsets.ensureCapacity(rangeLength, status) ||
            !indices.ensureCapacity(rangeLength, status)) {
        return;
    }
    if (U_FAILURE(status)) {
        return;
    }

    LocalPointer<BreakIterator> graphemeIter(
        BreakIterator::createCharacterInstance(Locale(), status));
    if (U_FAILURE(status)) {
        return;
    }
    graphemeIter->setText(text, status);
    if (U_FAILURE(status)) {
        return;
    }

    // Position the iterator before startPos so the next() calls below walk
    // forward from there.
    if (startPos != 0) {
        graphemeIter->preceding(startPos);
    }

    int32_t clusterStart = startPos;
    char16_t str[MAX_GRAPHEME_CLSTER_LENGTH];
    for (int32_t boundary = graphemeIter->next();
         boundary != BreakIterator::DONE && boundary < endPos;
         boundary = graphemeIter->next()) {
        // Boundaries at or before startPos only move the cluster start.
        if (boundary > startPos) {
            utext_extract(text, clusterStart, boundary, str,
                          MAX_GRAPHEME_CLSTER_LENGTH, &status);
            if (U_FAILURE(status)) {
                return;
            }
            offsets.addElement(clusterStart, status);
            indices.addElement(stringToIndex(str), status);
            if (U_FAILURE(status)) {
                return;
            }
        }
        clusterStart = boundary;
    }

    if (U_FAILURE(status) || clusterStart >= endPos) {
        return;
    }

    // Emit the final piece running from the last boundary up to endPos.
    utext_extract(text, clusterStart, endPos, str,
                  MAX_GRAPHEME_CLSTER_LENGTH, &status);
    if (U_FAILURE(status)) {
        return;
    }
    offsets.addElement(clusterStart, status);
    indices.addElement(stringToIndex(str), status);
}
// NOTE(review): this span appears to join the opening of compute() with the body of a different function (ifco is declared again, `return 0` is used, status/input_seq_len are not parameters), and several statements sit on the same line as a preceding `//` comment - confirm against the complete file before editing the code.
// Computing LSTM as stated in // https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate // ifco is temp array allocate outside which does not need to be // input/output value but could avoid unnecessary memory alloc/free if passing // in. void compute(
int32_t hunits, const ReadArray2D& W, const ReadArray2D& U, const ReadArray1D& b, const ReadArray1D& x, Array1D& h, Array1D& c,
Array1D& ifco)
{ // ifco = x * W + h * U + b
ifco.assign(b)
.addDotProduct(x, W)
.addDotProduct(h, U);
// ----- Begin of all the Array memory allocation needed for this function // Allocate temp array used inside compute()
Array1D ifco(4 * hunits, status);
// TODO: limit size of hBackward. If input_seq_len is too big, we could // run out of memory. // Backward LSTM
Array2D hBackward(input_seq_len, hunits, status);
// Allocate fbRow and slice the internal array in two.
Array1D fbRow(2 * hunits, status);
// ----- End of all the Array memory allocation needed for this function if (U_FAILURE(status)) return 0;
// To save the needed memory usage, the following is different from the // Python or ICU4X implementation. We first perform the Backward LSTM // and then merge the iteration of the forward LSTM and the output layer // together because we only need to remember the h[t-1] for Forward LSTM. for (int32_t i = input_seq_len - 1; i >= 0; i--) {
Array1D hRow = hBackward.row(i); if (i != input_seq_len - 1) {
hRow.assign(hBackward.row(i+1));
} #ifdef LSTM_DEBUG
printf("hRow %d\n", i);
hRow.print();
printf("indicesBuf[%d] = %d\n", i, indicesBuf[i]);
printf("fData->fEmbedding.row(indicesBuf[%d]):\n", i);
fData->fEmbedding.row(indicesBuf[i]).print(); #endif// LSTM_DEBUG
compute(hunits,
fData->fBackwardW, fData->fBackwardU, fData->fBackwardB,
fData->fEmbedding.row(indicesBuf[i]),
hRow, c, ifco);
}
Array1D forwardRow = fbRow.slice(0, hunits); // point to first half of data in fbRow.
Array1D backwardRow = fbRow.slice(hunits, hunits); // point to second half of data in fbRow.
// The following iteration merges the forward LSTM and the output layer // together.
c.clear(); // reuse c since it is the same size. for (int32_t i = 0; i < input_seq_len; i++) { #ifdef LSTM_DEBUG
printf("forwardRow %d\n", i);
forwardRow.print(); #endif// LSTM_DEBUG // Forward LSTM // Calculate the result into forwardRow, which point to the data in the first half // of fbRow.
compute(hunits,
fData->fForwardW, fData->fForwardU, fData->fForwardB,
fData->fEmbedding.row(indicesBuf[i]),
forwardRow, c, ifco);
// assign the data from hBackward.row(i) to second half of fbRow.
backwardRow.assign(hBackward.row(i));
// current = argmax(logp)
LSTMClass current = static_cast<LSTMClass>(logp.maxIndex()); // BIES logic. if (current == BEGIN || current == SINGLE) { if (i != 0) {
foundBreaks.addElement(offsetsBuf[i], status); if (U_FAILURE(status)) return 0;
}
}
} return foundBreaks.size() - beginFoundBreakSize;
}
// Construct a break engine from `data` for the characters in `set`.
// fVectorizer is created from fData via createVectorizer(), which may set
// `status`. When status reports a failure after member initialization,
// fData is reset to null and the constructor returns without calling
// setCharacters().
LSTMBreakEngine::LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status)
    : DictionaryBreakEngine(), fData(data), fVectorizer(createVectorizer(fData, status))
{
    if (U_FAILURE(status)) {
        // If failure, we should not delete fData in destructor because the
        // caller will do so.
        fData = nullptr;
        return;
    }
    setCharacters(set);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.