OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [classpath/] [gnu/] [java/] [text/] [WordBreakIterator.java] - Blame information for rev 769

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 769 jeremybenn
/* WordBreakIterator.java - Default word BreakIterator.
2
   Copyright (C) 1999, 2001, 2004 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version. */
37
 
38
 
39
package gnu.java.text;
40
 
41
import java.text.CharacterIterator;
42
 
43
/**
44
 * @author Tom Tromey <tromey@cygnus.com>
45
 * @date March 22, 1999
46
 * Written using The Unicode Standard, Version 2.0.
47
 */
48
 
49
public class WordBreakIterator extends BaseBreakIterator
50
{
51
  public Object clone ()
52
  {
53
    return new WordBreakIterator (this);
54
  }
55
 
56
  public WordBreakIterator ()
57
  {
58
  }
59
 
60
  private WordBreakIterator (WordBreakIterator other)
61
  {
62
    iter = (CharacterIterator) other.iter.clone();
63
  }
64
 
65
  // Some methods to tell us different properties of characters.
66
  private final boolean isHira (char c)
67
  {
68
    return c >= 0x3040 && c <= 0x309f;
69
  }
70
  private final boolean isKata (char c)
71
  {
72
    return c >= 0x30a0 && c <= 0x30ff;
73
  }
74
  private final boolean isHan (char c)
75
  {
76
    return c >= 0x4e00 && c <= 0x9fff;
77
  }
78
 
79
  public int next ()
80
  {
81
    int end = iter.getEndIndex();
82
    if (iter.getIndex() == end)
83
      return DONE;
84
 
85
    while (iter.getIndex() < end)
86
      {
87
        char c = iter.current();
88
        if (c == CharacterIterator.DONE)
89
          break;
90
        int type = Character.getType(c);
91
 
92
        char n = iter.next();
93
        if (n == CharacterIterator.DONE)
94
          break;
95
 
96
        // Break after paragraph separators.
97
        if (type == Character.PARAGRAPH_SEPARATOR
98
            || type == Character.LINE_SEPARATOR)
99
          break;
100
 
101
        // Break between letters and non-letters.
102
        // FIXME: we treat apostrophe as part of a word.  This
103
        // is an English-ism.
104
        boolean is_letter = Character.isLetter(c);
105
        if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
106
            && Character.isLetter(n))
107
          break;
108
 
109
        // Always break after certain symbols, such as punctuation.
110
        // This heuristic is derived from hints in the JCL book and is
111
        // not part of Unicode.  It seems to be right, however.
112
        // FIXME: we treat apostrophe as part of a word.  This
113
        // is an English-ism.
114
        if (c != '\''
115
            && (type == Character.DASH_PUNCTUATION
116
                || type == Character.START_PUNCTUATION
117
                || type == Character.END_PUNCTUATION
118
                || type == Character.CONNECTOR_PUNCTUATION
119
                || type == Character.OTHER_PUNCTUATION
120
                || type == Character.MATH_SYMBOL
121
                || type == Character.CURRENCY_SYMBOL
122
                || type == Character.MODIFIER_SYMBOL
123
                || type == Character.OTHER_SYMBOL
124
                || type == Character.FORMAT
125
                || type == Character.CONTROL))
126
          break;
127
 
128
        boolean is_hira = isHira (c);
129
        boolean is_kata = isKata (c);
130
        boolean is_han = isHan (c);
131
 
132
        // Special case Japanese.
133
        if (! is_hira && ! is_kata && ! is_han
134
            && type != Character.NON_SPACING_MARK
135
            && (isHira (n) || isKata (n) || isHan (n)))
136
          break;
137
 
138
        if (is_hira || is_kata || is_han || is_letter)
139
          {
140
            // Now we need to do some lookahead.  We might need to do
141
            // quite a bit of lookahead, so we save our position and
142
            // restore it later.
143
            int save = iter.getIndex();
144
            // Skip string of non spacing marks.
145
            while (n != CharacterIterator.DONE
146
                   && Character.getType(n) == Character.NON_SPACING_MARK)
147
              n = iter.next();
148
            if (n == CharacterIterator.DONE)
149
              break;
150
            if ((is_hira && ! isHira (n))
151
                || (is_kata && ! isHira (n) && ! isKata (n))
152
                || (is_han && ! isHira (n) && ! isHan (n))
153
                // FIXME: we treat apostrophe as part of a word.  This
154
                // is an English-ism.
155
                || (is_letter && ! Character.isLetter(n) && n != '\''))
156
              break;
157
            iter.setIndex(save);
158
          }
159
      }
160
 
161
    return iter.getIndex();
162
  }
163
 
164
  public int previous ()
165
  {
166
    int start = iter.getBeginIndex();
167
    if (iter.getIndex() == start)
168
      return DONE;
169
 
170
    while (iter.getIndex() >= start)
171
      {
172
        char c = iter.previous();
173
        if (c == CharacterIterator.DONE)
174
          break;
175
 
176
        boolean is_hira = isHira (c);
177
        boolean is_kata = isKata (c);
178
        boolean is_han = isHan (c);
179
        boolean is_letter = Character.isLetter(c);
180
 
181
        char n = iter.previous();
182
        if (n == CharacterIterator.DONE)
183
          break;
184
        iter.next();
185
        int type = Character.getType(n);
186
        // Break after paragraph separators.
187
        if (type == Character.PARAGRAPH_SEPARATOR
188
            || type == Character.LINE_SEPARATOR)
189
          break;
190
 
191
        // Break between letters and non-letters.
192
        // FIXME: we treat apostrophe as part of a word.  This
193
        // is an English-ism.
194
        if (n != '\'' && ! Character.isLetter(n)
195
            && type != Character.NON_SPACING_MARK
196
            && is_letter)
197
          break;
198
 
199
        // Always break after certain symbols, such as punctuation.
200
        // This heuristic is derived from hints in the JCL book and is
201
        // not part of Unicode.  It seems to be right, however.
202
        // FIXME: we treat apostrophe as part of a word.  This
203
        // is an English-ism.
204
        if (n != '\''
205
            && (type == Character.DASH_PUNCTUATION
206
                || type == Character.START_PUNCTUATION
207
                || type == Character.END_PUNCTUATION
208
                || type == Character.CONNECTOR_PUNCTUATION
209
                || type == Character.OTHER_PUNCTUATION
210
                || type == Character.MATH_SYMBOL
211
                || type == Character.CURRENCY_SYMBOL
212
                || type == Character.MODIFIER_SYMBOL
213
                || type == Character.OTHER_SYMBOL
214
                || type == Character.FORMAT
215
                || type == Character.CONTROL))
216
          break;
217
 
218
        // Special case Japanese.
219
        if ((is_hira || is_kata || is_han)
220
            && ! isHira (n) && ! isKata (n) && ! isHan (n)
221
            && type != Character.NON_SPACING_MARK)
222
          break;
223
 
224
        // We might have to skip over non spacing marks to see what's
225
        // on the other side.
226
        if (! is_hira || (! is_letter && c != '\''))
227
          {
228
            int save = iter.getIndex();
229
            while (n != CharacterIterator.DONE
230
                   && Character.getType(n) == Character.NON_SPACING_MARK)
231
              n = iter.previous();
232
            iter.setIndex(save);
233
            // This is a strange case: a bunch of non-spacing marks at
234
            // the beginning.  We treat the current location as a word
235
            // break.
236
            if (n == CharacterIterator.DONE)
237
              break;
238
            if ((isHira (n) && ! is_hira)
239
                || (isKata (n) && ! is_hira && ! is_kata)
240
                || (isHan (n) && ! is_hira && ! is_han)
241
                // FIXME: we treat apostrophe as part of a word.  This
242
                // is an English-ism.
243
                || (! is_letter && c != '\'' && Character.isLetter(n)))
244
              break;
245
          }
246
      }
247
 
248
    return iter.getIndex();
249
  }
250
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.