TDME2  1.9.200
UTF8CharacterIterator.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <string>
4 #include <vector>
5 
6 #include <tdme/tdme.h>
7 #include <tdme/math/Math.h>
10 
11 using std::string;
12 using std::to_string;
13 using std::vector;
14 
15 using tdme::math::Math;
16 
18 
19 /**
20  * UTF8 string character iterator
21  */
23 public:
24 
25  /**
26  * UTF8 cache entry
27  */
29  friend class UTF8CharacterIterator;
30  public:
31  static constexpr int CACHE_ENTRY_SIZE { 100 };
32 
33  /**
34  * Remove cache
35  */
36  inline void removeCache() {
37  binaryCache.clear();
38  characterCache.clear();
39  }
40 
41  /**
42  * Remove from cache by binary index
43  * @param idx binary index
44  */
45  inline void removeCache(int binaryIdx, int characterIdx) {
46  // Console::println("MutableString::removeCache(): binary: " + to_string(binaryIdx) + ", character: " + to_string(characterIdx));
47  // remove succeeding entries from binary cache
49  auto& _cache = binaryCache;
50  auto removeFromCacheEntryIdx = (binaryIdx / UTF8CharacterIterator::UTF8PositionCache::CACHE_ENTRY_SIZE) - 1;
51  // Console::println("\tRemoving binary: " + to_string(removeFromCacheEntryIdx) + " / " + to_string(_cache.size() - 1) + " = " + to_string((removeFromCacheEntryIdx + 1) * UTF8CharacterIterator::UTF8PositionCache::CACHE_ENTRY_SIZE));
52  if (removeFromCacheEntryIdx < _cache.size()) {
53  _cache.erase(_cache.begin() + removeFromCacheEntryIdx, _cache.end());
54  }
55  } else {
56  binaryCache.clear();
57  }
58  // remove succeeding entries from character position cache
60  auto& _cache = characterCache;
61  auto removeFromCacheEntryIdx = (characterIdx / UTF8CharacterIterator::UTF8PositionCache::CACHE_ENTRY_SIZE) - 1;
62  // Console::println("\tRemoving character: " + to_string(removeFromCacheEntryIdx) + " / " + to_string(_cache.size() - 1) + " = " + to_string((removeFromCacheEntryIdx + 1) * UTF8CharacterIterator::UTF8PositionCache::CACHE_ENTRY_SIZE));
63  if (removeFromCacheEntryIdx < _cache.size()) {
64  _cache.erase(_cache.begin() + removeFromCacheEntryIdx, _cache.end());
65  }
66  } else {
67  characterCache.clear();
68  }
69  }
70 
71  private:
/**
 * UTF8 position cache entry, which pairs a binary position with the character position found there
 */
struct UTF8PositionCacheEntry {
	/**
	 * Constructor
	 * @param binaryPosition binary position
	 * @param characterPosition character position
	 */
	UTF8PositionCacheEntry(
		int binaryPosition,
		int characterPosition
	):
		binaryPosition(binaryPosition),
		characterPosition(characterPosition)
	{}
	int binaryPosition;
	int characterPosition;
};
83  vector<UTF8PositionCacheEntry> binaryCache;
84  vector<UTF8PositionCacheEntry> characterCache;
85  };
86 
87  // forbid class copy
89 
90  /**
91  * Public constructor
92  * @param stringReference string reference
93  * @param cache UTF8 position cache or nullptr if UTF8 positions should not be cached
94  */
96  //
97  }
98 
99  /**
100  * Reset
101  */
102  inline void reset() const {
103  binaryPosition = 0;
104  characterPosition = 0;
105  }
106 
107  /**
108  * @return underlying binary buffer position
109  */
110  inline int getBinaryPosition() const {
111  return binaryPosition;
112  }
113 
114  /**
115  * Set underlying binary buffer position
116  * @param position underlying buffer position
117  */
118  inline void seekBinaryPosition(int position) const {
119  reset();
120  // seeking in cache first
121  if (position >= UTF8PositionCache::CACHE_ENTRY_SIZE && cache != nullptr && cache->binaryCache.empty() == false) {
122  auto cacheIdx = Math::min((position / UTF8PositionCache::CACHE_ENTRY_SIZE) - 1, cache->binaryCache.size() - 1);
123  const auto& cacheEntry = cache->binaryCache[cacheIdx];
124  binaryPosition = cacheEntry.binaryPosition;
125  characterPosition = cacheEntry.characterPosition;
126  }
127  //
128  while (hasNext() == true && binaryPosition < position) {
129  if (hasNext() == true) next();
130  }
131  }
132 
133  /**
134  * @return character position
135  */
136  inline int getCharacterPosition() const {
137  return characterPosition;
138  }
139 
140  /**
141  * Seek character position
142  * @param position character position
143  */
144  inline void seekCharacterPosition(int position) const {
145  reset();
146  // seeking in cache first
147  if (position >= UTF8PositionCache::CACHE_ENTRY_SIZE && cache != nullptr && cache->characterCache.empty() == false) {
148  auto cacheIdx = Math::min((position / UTF8PositionCache::CACHE_ENTRY_SIZE) - 1, cache->characterCache.size() - 1);
149  const auto& cacheEntry = cache->characterCache[cacheIdx];
150  binaryPosition = cacheEntry.binaryPosition;
151  characterPosition = cacheEntry.characterPosition;
152  }
153  //
154  auto seekCount = position - characterPosition;
155  for (auto i = 0; i < seekCount; i++) {
156  if (hasNext() == true) next();
157  }
158  }
159 
160  /**
161  * @return next character available
162  */
163  inline bool hasNext() const {
164  return binaryPosition < stringReference.size();
165  }
166  /**
167  * @return next character or -1 if an error occurred or no string left
168  */
169  inline int next() const {
170  // see: http://www.zedwood.com/article/cpp-utf8-char-to-codepoint
171  int l = stringReference.size() - binaryPosition;
172  if (l < 1) return -1;
173  unsigned char u0 = stringReference[binaryPosition + 0];
174  if (u0 >= 0 && u0 <= 127) {
175  addCacheEntry();
176  binaryPosition++;
178  return u0;
179  }
180  if (l < 2) {
181  addCacheEntry();
182  binaryPosition++;
184  return -1;
185  }
186  unsigned char u1 = stringReference[binaryPosition + 1];
187  if (u0 >= 192 && u0 <= 223) {
188  addCacheEntry();
189  binaryPosition+= 2;
191  return (u0 - 192) * 64 + (u1 - 128);
192  }
193  if (u0 == 0xed && (u1 & 0xa0) == 0xa0) {
194  addCacheEntry();
195  binaryPosition+= 2;
197  return -1; // code points, 0xd800 to 0xdfff
198  }
199  if (l < 3) {
200  addCacheEntry();
201  binaryPosition+= 2;
203  return -1;
204  }
205  unsigned char u2 = stringReference[binaryPosition + 2];
206  if (u0 >= 224 && u0 <= 239) {
207  addCacheEntry();
208  binaryPosition+= 3;
210  return (u0 - 224) * 4096 + (u1 - 128) * 64 + (u2 - 128);
211  }
212  if (l < 4) {
213  addCacheEntry();
214  binaryPosition+= 3;
216  return -1;
217  }
218  unsigned char u3 = stringReference[binaryPosition + 3];
219  if (u0 >= 240 && u0 <= 247) {
220  addCacheEntry();
221  binaryPosition+= 4;
223  return (u0 - 240) * 262144 + (u1 - 128) * 4096 + (u2 - 128) * 64 + (u3 - 128);
224  }
225  //
226  addCacheEntry();
227  binaryPosition+= 4;
229  //
230  return -1;
231  }
232 
233 private:
234  const string& stringReference;
235  mutable int binaryPosition { 0 };
236  mutable int characterPosition { 0 };
238 
239  /**
240  * Add cache entry
241  */
242  inline void addCacheEntry() const {
243  // store every UTF8PositionCache::CACHE_ENTRY_SIZE character position, if not yet done
244  if (cache != nullptr) {
245  // binary cache
246  {
247  auto& _cache = cache->binaryCache;
248  if (binaryPosition > 0 && (binaryPosition % UTF8PositionCache::CACHE_ENTRY_SIZE) == 0 && (_cache.empty() == true || _cache[_cache.size() - 1].binaryPosition < binaryPosition)) {
249  _cache.emplace_back(
252  );
253  /*
254  Console::println("UTF8CharacterIterator::addCacheEntry(): binary cache: binary: " + to_string(binaryPosition) + " / character: " + to_string(characterPosition));
255  for (const auto& cacheEntry: _cache) {
256  Console::println("\tbinary cache: binary: " + to_string(cacheEntry.binaryPosition) + " / character: " + to_string(cacheEntry.characterPosition));
257  }
258  */
259  }
260  }
261  // character cache
262  {
263  auto& _cache = cache->characterCache;
264  if (characterPosition > 0 && (characterPosition % UTF8PositionCache::CACHE_ENTRY_SIZE) == 0 && (_cache.empty() == true || _cache[_cache.size() - 1].characterPosition < characterPosition)) {
265  // Console::println("UTF8CharacterIterator::addCacheEntry(): character cache: binary: " + to_string(binaryPosition) + " / character: " + to_string(characterPosition));
266  _cache.emplace_back(
269  );
270  /*
271  for (const auto& cacheEntry: _cache) {
272  Console::println("\tcharacter cache: binary: " + to_string(cacheEntry.binaryPosition) + " / character: " + to_string(cacheEntry.characterPosition));
273  }
274  */
275  }
276  }
277  }
278  }
279 
280 };
Standard math functions.
Definition: Math.h:19
Console class.
Definition: Console.h:29
void removeCache(int binaryIdx, int characterIdx)
Remove from cache by binary index.
UTF8 string character iterator.
void seekBinaryPosition(int position) const
Set underlying binary buffer position.
void addCacheEntry() const
Add cache entry.
void seekCharacterPosition(int position) const
Seek character position.
#define FORBID_CLASS_COPY(CLASS)
Definition: tdme.h:6