1 |
771 |
jeremybenn |
/* CollationElementIterator.java -- Walks through collation elements
|
2 |
|
|
Copyright (C) 1998, 1999, 2001, 2002, 2003, 2004 Free Software Foundation
|
3 |
|
|
|
4 |
|
|
This file is part of GNU Classpath.
|
5 |
|
|
|
6 |
|
|
GNU Classpath is free software; you can redistribute it and/or modify
|
7 |
|
|
it under the terms of the GNU General Public License as published by
|
8 |
|
|
the Free Software Foundation; either version 2, or (at your option)
|
9 |
|
|
any later version.
|
10 |
|
|
|
11 |
|
|
GNU Classpath is distributed in the hope that it will be useful, but
|
12 |
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 |
|
|
General Public License for more details.
|
15 |
|
|
|
16 |
|
|
You should have received a copy of the GNU General Public License
|
17 |
|
|
along with GNU Classpath; see the file COPYING. If not, write to the
|
18 |
|
|
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
19 |
|
|
02110-1301 USA.
|
20 |
|
|
|
21 |
|
|
Linking this library statically or dynamically with other modules is
|
22 |
|
|
making a combined work based on this library. Thus, the terms and
|
23 |
|
|
conditions of the GNU General Public License cover the whole
|
24 |
|
|
combination.
|
25 |
|
|
|
26 |
|
|
As a special exception, the copyright holders of this library give you
|
27 |
|
|
permission to link this library with independent modules to produce an
|
28 |
|
|
executable, regardless of the license terms of these independent
|
29 |
|
|
modules, and to copy and distribute the resulting executable under
|
30 |
|
|
terms of your choice, provided that you also meet, for each linked
|
31 |
|
|
independent module, the terms and conditions of the license of that
|
32 |
|
|
module. An independent module is a module which is not derived from
|
33 |
|
|
or based on this library. If you modify this library, you may extend
|
34 |
|
|
this exception to your version of the library, but you are not
|
35 |
|
|
obligated to do so. If you do not wish to do so, delete this
|
36 |
|
|
exception statement from your version. */
|
37 |
|
|
|
38 |
|
|
|
39 |
|
|
package java.text;
|
40 |
|
|
|
41 |
|
|
import gnu.java.lang.CPStringBuilder;
|
42 |
|
|
|
43 |
|
|
import java.util.ArrayList;
|
44 |
|
|
|
45 |
|
|
/* Written using "Java Class Libraries", 2nd edition, plus online
|
46 |
|
|
* API docs for JDK 1.2 from http://www.javasoft.com.
|
47 |
|
|
* Status: Believed complete and correct to JDK 1.1.
|
48 |
|
|
*/
|
49 |
|
|
|
50 |
|
|
/**
|
51 |
|
|
* This class walks through the character collation elements of a
|
52 |
|
|
* <code>String</code> as defined by the collation rules in an instance of
|
53 |
|
|
* <code>RuleBasedCollator</code>. There is no public constructor for
|
54 |
|
|
* this class. An instance is created by calling the
|
55 |
|
|
* <code>getCollationElementIterator</code> method on
|
56 |
|
|
* <code>RuleBasedCollator</code>.
|
57 |
|
|
*
|
58 |
|
|
* @author Aaron M. Renn (arenn@urbanophile.com)
|
59 |
|
|
* @author Tom Tromey (tromey@cygnus.com)
|
60 |
|
|
* @author Guilhem Lavaux (guilhem.lavaux@free.fr)
|
61 |
|
|
*/
|
62 |
|
|
public final class CollationElementIterator
|
63 |
|
|
{
|
64 |
|
|
/**
|
65 |
|
|
* This is a constant value that is returned to indicate that the end of
|
66 |
|
|
* the string was encountered.
|
67 |
|
|
*/
|
68 |
|
|
public static final int NULLORDER = -1;
|
69 |
|
|
|
70 |
|
|
/**
|
71 |
|
|
* This is the RuleBasedCollator this object was created from.
|
72 |
|
|
*/
|
73 |
|
|
RuleBasedCollator collator;
|
74 |
|
|
|
75 |
|
|
/**
|
76 |
|
|
* This is the String that is being iterated over.
|
77 |
|
|
*/
|
78 |
|
|
CharacterIterator text;
|
79 |
|
|
|
80 |
|
|
/**
|
81 |
|
|
* This is the index into the collation decomposition where we are currently scanning.
|
82 |
|
|
*/
|
83 |
|
|
int index;
|
84 |
|
|
|
85 |
|
|
/**
|
86 |
|
|
* This is the index into the String where we are currently scanning.
|
87 |
|
|
*/
|
88 |
|
|
int textIndex;
|
89 |
|
|
|
90 |
|
|
/**
|
91 |
|
|
* Array containing the collation decomposition of the
|
92 |
|
|
* text given to the constructor.
|
93 |
|
|
*/
|
94 |
|
|
private RuleBasedCollator.CollationElement[] text_decomposition;
|
95 |
|
|
|
96 |
|
|
/**
|
97 |
|
|
* Array containing the index of the specified block.
|
98 |
|
|
*/
|
99 |
|
|
private int[] text_indexes;
|
100 |
|
|
|
101 |
|
|
/**
 * Creates a <code>CollationElementIterator</code> that walks over the
 * given <code>String</code> according to the rules of the given
 * <code>RuleBasedCollator</code>.
 *
 * @param collator The <code>RuleBasedCollator</code> used for calculating
 * collation values.
 * @param text The <code>String</code> to iterate over.
 */
CollationElementIterator(RuleBasedCollator collator, String text)
{
  // Store the collator before decomposing: setText() looks entries up in it.
  this.collator = collator;
  this.setText(text);
}
|
115 |
|
|
|
116 |
|
|
/**
 * Creates a <code>CollationElementIterator</code> that walks over the
 * characters delivered by the given <code>CharacterIterator</code>
 * according to the rules of the given <code>RuleBasedCollator</code>.
 *
 * @param collator The <code>RuleBasedCollator</code> used for calculating
 * collation values.
 * @param text The character iterator to iterate over.
 */
CollationElementIterator(RuleBasedCollator collator, CharacterIterator text)
{
  // Store the collator before decomposing: setText() looks entries up in it.
  this.collator = collator;
  this.setText(text);
}
|
130 |
|
|
|
131 |
|
|
RuleBasedCollator.CollationElement nextBlock()
|
132 |
|
|
{
|
133 |
|
|
if (index >= text_decomposition.length)
|
134 |
|
|
return null;
|
135 |
|
|
|
136 |
|
|
RuleBasedCollator.CollationElement e = text_decomposition[index];
|
137 |
|
|
|
138 |
|
|
textIndex = text_indexes[index+1];
|
139 |
|
|
|
140 |
|
|
index++;
|
141 |
|
|
|
142 |
|
|
return e;
|
143 |
|
|
}
|
144 |
|
|
|
145 |
|
|
RuleBasedCollator.CollationElement previousBlock()
|
146 |
|
|
{
|
147 |
|
|
if (index == 0)
|
148 |
|
|
return null;
|
149 |
|
|
|
150 |
|
|
index--;
|
151 |
|
|
RuleBasedCollator.CollationElement e = text_decomposition[index];
|
152 |
|
|
|
153 |
|
|
textIndex = text_indexes[index+1];
|
154 |
|
|
|
155 |
|
|
return e;
|
156 |
|
|
}
|
157 |
|
|
|
158 |
|
|
/**
|
159 |
|
|
* This method returns the collation ordering value of the next character sequence
|
160 |
|
|
* in the string (it may be an extended character following collation rules).
|
161 |
|
|
* This method will return <code>NULLORDER</code> if the
|
162 |
|
|
* end of the string was reached.
|
163 |
|
|
*
|
164 |
|
|
* @return The collation ordering value.
|
165 |
|
|
*/
|
166 |
|
|
public int next()
|
167 |
|
|
{
|
168 |
|
|
RuleBasedCollator.CollationElement e = nextBlock();
|
169 |
|
|
|
170 |
|
|
if (e == null)
|
171 |
|
|
return NULLORDER;
|
172 |
|
|
|
173 |
|
|
return e.getValue();
|
174 |
|
|
}
|
175 |
|
|
|
176 |
|
|
/**
|
177 |
|
|
* This method returns the collation ordering value of the previous character
|
178 |
|
|
* in the string. This method will return <code>NULLORDER</code> if the
|
179 |
|
|
* beginning of the string was reached.
|
180 |
|
|
*
|
181 |
|
|
* @return The collation ordering value.
|
182 |
|
|
*/
|
183 |
|
|
public int previous()
|
184 |
|
|
{
|
185 |
|
|
RuleBasedCollator.CollationElement e = previousBlock();
|
186 |
|
|
|
187 |
|
|
if (e == null)
|
188 |
|
|
return NULLORDER;
|
189 |
|
|
|
190 |
|
|
return e.getValue();
|
191 |
|
|
}
|
192 |
|
|
|
193 |
|
|
/**
|
194 |
|
|
* This method returns the primary order value for the given collation
|
195 |
|
|
* value.
|
196 |
|
|
*
|
197 |
|
|
* @param order The collation value returned from <code>next()</code> or
|
198 |
|
|
* <code>previous()</code>.
|
199 |
|
|
*
|
200 |
|
|
* @return The primary order value of the specified collation value. This is
|
201 |
|
|
* the high 16 bits.
|
202 |
|
|
*/
|
203 |
|
|
public static int primaryOrder(int order)
|
204 |
|
|
{
|
205 |
|
|
// From the JDK 1.2 spec.
|
206 |
|
|
return order >>> 16;
|
207 |
|
|
}
|
208 |
|
|
|
209 |
|
|
/**
|
210 |
|
|
* This method resets the internal position pointer to read from the
|
211 |
|
|
* beginning of the <code>String</code> again.
|
212 |
|
|
*/
|
213 |
|
|
public void reset()
|
214 |
|
|
{
|
215 |
|
|
index = 0;
|
216 |
|
|
textIndex = 0;
|
217 |
|
|
}
|
218 |
|
|
|
219 |
|
|
/**
|
220 |
|
|
* This method returns the secondary order value for the given collation
|
221 |
|
|
* value.
|
222 |
|
|
*
|
223 |
|
|
* @param order The collation value returned from <code>next()</code> or
|
224 |
|
|
* <code>previous()</code>.
|
225 |
|
|
*
|
226 |
|
|
* @return The secondary order value of the specified collation value. This
|
227 |
|
|
* is the bits 8-15.
|
228 |
|
|
*/
|
229 |
|
|
public static short secondaryOrder(int order)
|
230 |
|
|
{
|
231 |
|
|
// From the JDK 1.2 spec.
|
232 |
|
|
return (short) ((order >>> 8) & 255);
|
233 |
|
|
}
|
234 |
|
|
|
235 |
|
|
/**
|
236 |
|
|
* This method returns the tertiary order value for the given collation
|
237 |
|
|
* value.
|
238 |
|
|
*
|
239 |
|
|
* @param order The collation value returned from <code>next()</code> or
|
240 |
|
|
* <code>previous()</code>.
|
241 |
|
|
*
|
242 |
|
|
* @return The tertiary order value of the specified collation value. This
|
243 |
|
|
* is the low eight bits.
|
244 |
|
|
*/
|
245 |
|
|
public static short tertiaryOrder(int order)
|
246 |
|
|
{
|
247 |
|
|
// From the JDK 1.2 spec.
|
248 |
|
|
return (short) (order & 255);
|
249 |
|
|
}
|
250 |
|
|
|
251 |
|
|
/**
|
252 |
|
|
* This method sets the <code>String</code> that it is iterating over
|
253 |
|
|
* to the specified <code>String</code>.
|
254 |
|
|
*
|
255 |
|
|
* @param text The new <code>String</code> to iterate over.
|
256 |
|
|
*
|
257 |
|
|
* @since 1.2
|
258 |
|
|
*/
|
259 |
|
|
public void setText(String text)
|
260 |
|
|
{
|
261 |
|
|
int idx = 0;
|
262 |
|
|
int idx_idx = 0;
|
263 |
|
|
int alreadyExpanded = 0;
|
264 |
|
|
int idxToMove = 0;
|
265 |
|
|
|
266 |
|
|
this.text = new StringCharacterIterator(text);
|
267 |
|
|
this.index = 0;
|
268 |
|
|
|
269 |
|
|
String work_text = text.intern();
|
270 |
|
|
|
271 |
|
|
ArrayList a_element = new ArrayList();
|
272 |
|
|
ArrayList a_idx = new ArrayList();
|
273 |
|
|
|
274 |
|
|
// Build element collection ordered as they come in "text".
|
275 |
|
|
while (idx < work_text.length())
|
276 |
|
|
{
|
277 |
|
|
String key, key_old;
|
278 |
|
|
|
279 |
|
|
Object object = null;
|
280 |
|
|
int p = 1;
|
281 |
|
|
|
282 |
|
|
// IMPROVE: use a TreeMap with a prefix-ordering rule.
|
283 |
|
|
key_old = key = null;
|
284 |
|
|
do
|
285 |
|
|
{
|
286 |
|
|
if (object != null)
|
287 |
|
|
key_old = key;
|
288 |
|
|
key = work_text.substring (idx, idx+p);
|
289 |
|
|
object = collator.prefix_tree.get (key);
|
290 |
|
|
if (object != null && idx < alreadyExpanded)
|
291 |
|
|
{
|
292 |
|
|
RuleBasedCollator.CollationElement prefix = (RuleBasedCollator.CollationElement)object;
|
293 |
|
|
if (prefix.expansion != null &&
|
294 |
|
|
prefix.expansion.startsWith(work_text.substring(0, idx)))
|
295 |
|
|
{
|
296 |
|
|
object = null;
|
297 |
|
|
key = key_old;
|
298 |
|
|
}
|
299 |
|
|
}
|
300 |
|
|
p++;
|
301 |
|
|
}
|
302 |
|
|
while (idx+p <= work_text.length());
|
303 |
|
|
|
304 |
|
|
if (object == null)
|
305 |
|
|
key = key_old;
|
306 |
|
|
|
307 |
|
|
RuleBasedCollator.CollationElement prefix =
|
308 |
|
|
(RuleBasedCollator.CollationElement) collator.prefix_tree.get (key);
|
309 |
|
|
|
310 |
|
|
/*
|
311 |
|
|
* First case: There is no such sequence in the database.
|
312 |
|
|
* We will have to build one from the context.
|
313 |
|
|
*/
|
314 |
|
|
if (prefix == null)
|
315 |
|
|
{
|
316 |
|
|
/*
|
317 |
|
|
* We are dealing with sequences in an expansion. They
|
318 |
|
|
* are treated as accented characters (tertiary order).
|
319 |
|
|
*/
|
320 |
|
|
if (alreadyExpanded > 0)
|
321 |
|
|
{
|
322 |
|
|
RuleBasedCollator.CollationElement e =
|
323 |
|
|
collator.getDefaultAccentedElement (work_text.charAt (idx));
|
324 |
|
|
|
325 |
|
|
a_element.add (e);
|
326 |
|
|
a_idx.add (new Integer(idx_idx));
|
327 |
|
|
idx++;
|
328 |
|
|
alreadyExpanded--;
|
329 |
|
|
if (alreadyExpanded == 0)
|
330 |
|
|
{
|
331 |
|
|
/* There is not any characters left in the expansion set.
|
332 |
|
|
* We can increase the pointer in the source string.
|
333 |
|
|
*/
|
334 |
|
|
idx_idx += idxToMove;
|
335 |
|
|
idxToMove = 0;
|
336 |
|
|
}
|
337 |
|
|
else
|
338 |
|
|
idx_idx++;
|
339 |
|
|
}
|
340 |
|
|
else
|
341 |
|
|
{
|
342 |
|
|
/* This is a normal character. */
|
343 |
|
|
RuleBasedCollator.CollationElement e =
|
344 |
|
|
collator.getDefaultElement (work_text.charAt (idx));
|
345 |
|
|
Integer i_ref = new Integer(idx_idx);
|
346 |
|
|
|
347 |
|
|
/* Don't forget to mark it as a special sequence so the
|
348 |
|
|
* string can be ordered.
|
349 |
|
|
*/
|
350 |
|
|
a_element.add (RuleBasedCollator.SPECIAL_UNKNOWN_SEQ);
|
351 |
|
|
a_idx.add (i_ref);
|
352 |
|
|
a_element.add (e);
|
353 |
|
|
a_idx.add (i_ref);
|
354 |
|
|
idx_idx++;
|
355 |
|
|
idx++;
|
356 |
|
|
}
|
357 |
|
|
continue;
|
358 |
|
|
}
|
359 |
|
|
|
360 |
|
|
/*
|
361 |
|
|
* Second case: Here we have found a matching sequence.
|
362 |
|
|
* Here we have an expansion string prepend it to the "work text" and
|
363 |
|
|
* add the corresponding sorting element. We must also mark
|
364 |
|
|
*/
|
365 |
|
|
if (prefix.expansion != null)
|
366 |
|
|
{
|
367 |
|
|
work_text = prefix.expansion
|
368 |
|
|
+ work_text.substring (idx+prefix.key.length());
|
369 |
|
|
idx = 0;
|
370 |
|
|
a_element.add (prefix);
|
371 |
|
|
a_idx.add (new Integer(idx_idx));
|
372 |
|
|
if (alreadyExpanded == 0)
|
373 |
|
|
idxToMove = prefix.key.length();
|
374 |
|
|
alreadyExpanded += prefix.expansion.length()-prefix.key.length();
|
375 |
|
|
}
|
376 |
|
|
else
|
377 |
|
|
{
|
378 |
|
|
/* Third case: the simplest. We have got the prefix and it
|
379 |
|
|
* has not to be expanded.
|
380 |
|
|
*/
|
381 |
|
|
a_element.add (prefix);
|
382 |
|
|
a_idx.add (new Integer(idx_idx));
|
383 |
|
|
idx += prefix.key.length();
|
384 |
|
|
/* If the sequence is in an expansion, we must decrease the
|
385 |
|
|
* counter.
|
386 |
|
|
*/
|
387 |
|
|
if (alreadyExpanded > 0)
|
388 |
|
|
{
|
389 |
|
|
alreadyExpanded -= prefix.key.length();
|
390 |
|
|
if (alreadyExpanded == 0)
|
391 |
|
|
{
|
392 |
|
|
idx_idx += idxToMove;
|
393 |
|
|
idxToMove = 0;
|
394 |
|
|
}
|
395 |
|
|
}
|
396 |
|
|
else
|
397 |
|
|
idx_idx += prefix.key.length();
|
398 |
|
|
}
|
399 |
|
|
}
|
400 |
|
|
|
401 |
|
|
text_decomposition = (RuleBasedCollator.CollationElement[])
|
402 |
|
|
a_element.toArray(new RuleBasedCollator.CollationElement[a_element.size()]);
|
403 |
|
|
text_indexes = new int[a_idx.size()+1];
|
404 |
|
|
for (int i = 0; i < a_idx.size(); i++)
|
405 |
|
|
{
|
406 |
|
|
text_indexes[i] = ((Integer)a_idx.get(i)).intValue();
|
407 |
|
|
}
|
408 |
|
|
text_indexes[a_idx.size()] = text.length();
|
409 |
|
|
}
|
410 |
|
|
|
411 |
|
|
/**
|
412 |
|
|
* This method sets the <code>String</code> that it is iterating over
|
413 |
|
|
* to the <code>String</code> represented by the specified
|
414 |
|
|
* <code>CharacterIterator</code>.
|
415 |
|
|
*
|
416 |
|
|
* @param source The <code>CharacterIterator</code> containing the new
|
417 |
|
|
* <code>String</code> to iterate over.
|
418 |
|
|
*/
|
419 |
|
|
public void setText(CharacterIterator source)
|
420 |
|
|
{
|
421 |
|
|
CPStringBuilder expand = new CPStringBuilder();
|
422 |
|
|
|
423 |
|
|
// For now assume we read from the beginning of the string.
|
424 |
|
|
for (char c = source.first();
|
425 |
|
|
c != CharacterIterator.DONE;
|
426 |
|
|
c = source.next())
|
427 |
|
|
expand.append(c);
|
428 |
|
|
|
429 |
|
|
setText(expand.toString());
|
430 |
|
|
}
|
431 |
|
|
|
432 |
|
|
/**
|
433 |
|
|
* This method returns the current offset into the <code>String</code>
|
434 |
|
|
* that is being iterated over.
|
435 |
|
|
*
|
436 |
|
|
* @return The iteration index position.
|
437 |
|
|
*
|
438 |
|
|
* @since 1.2
|
439 |
|
|
*/
|
440 |
|
|
public int getOffset()
|
441 |
|
|
{
|
442 |
|
|
return textIndex;
|
443 |
|
|
}
|
444 |
|
|
|
445 |
|
|
/**
|
446 |
|
|
* This method sets the iteration index position into the current
|
447 |
|
|
* <code>String</code> to the specified value. This value must not
|
448 |
|
|
* be negative and must not be greater than the last index position
|
449 |
|
|
* in the <code>String</code>.
|
450 |
|
|
*
|
451 |
|
|
* @param offset The new iteration index position.
|
452 |
|
|
*
|
453 |
|
|
* @exception IllegalArgumentException If the new offset is not valid.
|
454 |
|
|
*/
|
455 |
|
|
public void setOffset(int offset)
|
456 |
|
|
{
|
457 |
|
|
if (offset < 0)
|
458 |
|
|
throw new IllegalArgumentException("Negative offset: " + offset);
|
459 |
|
|
|
460 |
|
|
if (offset > (text.getEndIndex() - 1))
|
461 |
|
|
throw new IllegalArgumentException("Offset too large: " + offset);
|
462 |
|
|
|
463 |
|
|
for (index = 0; index < text_decomposition.length; index++)
|
464 |
|
|
{
|
465 |
|
|
if (offset <= text_indexes[index])
|
466 |
|
|
break;
|
467 |
|
|
}
|
468 |
|
|
/*
|
469 |
|
|
* As text_indexes[0] == 0, we should not have to take care whether index is
|
470 |
|
|
* greater than 0. It is always.
|
471 |
|
|
*/
|
472 |
|
|
if (text_indexes[index] == offset)
|
473 |
|
|
textIndex = offset;
|
474 |
|
|
else
|
475 |
|
|
textIndex = text_indexes[index-1];
|
476 |
|
|
}
|
477 |
|
|
|
478 |
|
|
/**
|
479 |
|
|
* This method returns the maximum length of any expansion sequence that
|
480 |
|
|
* ends with the specified collation order value. (Whatever that means).
|
481 |
|
|
*
|
482 |
|
|
* @param value The collation order value
|
483 |
|
|
*
|
484 |
|
|
* @return The maximum length of an expansion sequence.
|
485 |
|
|
*/
|
486 |
|
|
public int getMaxExpansion(int value)
|
487 |
|
|
{
|
488 |
|
|
return 1;
|
489 |
|
|
}
|
490 |
|
|
}
|