1 |
721 |
jeremybenn |
/*
|
2 |
|
|
* Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
|
3 |
|
|
*
|
4 |
|
|
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
|
5 |
|
|
* OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
|
6 |
|
|
*
|
7 |
|
|
* Permission is hereby granted to use or copy this program
|
8 |
|
|
* for any purpose, provided the above notices are retained on all copies.
|
9 |
|
|
* Permission to modify the code and to distribute modified code is granted,
|
10 |
|
|
* provided the above notices are retained, and a notice that the code was
|
11 |
|
|
* modified is included with the above copyright notice.
|
12 |
|
|
*
|
13 |
|
|
* Author: Hans-J. Boehm (boehm@parc.xerox.com)
|
14 |
|
|
*/
|
15 |
|
|
/* Boehm, October 5, 1995 4:20 pm PDT */
|
16 |
|
|
|
17 |
|
|
/*
|
18 |
|
|
* Cords are immutable character strings. A number of operations
|
19 |
|
|
* on long cords are much more efficient than their strings.h counterpart.
|
20 |
|
|
* In particular, concatenation takes constant time independent of the length
|
21 |
|
|
* of the arguments. (Cords are represented as trees, with internal
|
22 |
|
|
* nodes representing concatenation and leaves consisting of either C
|
23 |
|
|
* strings or a functional description of the string.)
|
24 |
|
|
*
|
25 |
|
|
* The following are reasonable applications of cords. They would perform
|
26 |
|
|
* unacceptably if C strings were used:
|
27 |
|
|
* - A compiler that produces assembly language output by repeatedly
|
28 |
|
|
* concatenating instructions onto a cord representing the output file.
|
29 |
|
|
* - A text editor that converts the input file to a cord, and then
|
30 |
|
|
* performs editing operations by producing a new cord representing
|
31 |
|
|
* the file after echa character change (and keeping the old ones in an
|
32 |
|
|
* edit history)
|
33 |
|
|
*
|
34 |
|
|
* For optimal performance, cords should be built by
|
35 |
|
|
* concatenating short sections.
|
36 |
|
|
* This interface is designed for maximum compatibility with C strings.
|
37 |
|
|
* ASCII NUL characters may be embedded in cords using CORD_from_fn.
|
38 |
|
|
* This is handled correctly, but CORD_to_char_star will produce a string
|
39 |
|
|
* with embedded NULs when given such a cord.
|
40 |
|
|
*
|
41 |
|
|
* This interface is fairly big, largely for performance reasons.
|
42 |
|
|
* The most basic constants and functions:
|
43 |
|
|
*
|
44 |
|
|
* CORD - the type of a cord;
|
45 |
|
|
* CORD_EMPTY - empty cord;
|
46 |
|
|
* CORD_len(cord) - length of a cord;
|
47 |
|
|
* CORD_cat(cord1,cord2) - concatenation of two cords;
|
48 |
|
|
* CORD_substr(cord, start, len) - substring (or subcord);
|
49 |
|
|
* CORD_pos i; CORD_FOR(i, cord) { ... CORD_pos_fetch(i) ... } -
|
50 |
|
|
* examine each character in a cord. CORD_pos_fetch(i) is the char.
|
51 |
|
|
* CORD_fetch(int i) - Retrieve i'th character (slowly).
|
52 |
|
|
* CORD_cmp(cord1, cord2) - compare two cords.
|
53 |
|
|
* CORD_from_file(FILE * f) - turn a read-only file into a cord.
|
54 |
|
|
* CORD_to_char_star(cord) - convert to C string.
|
55 |
|
|
* (Non-NULL C constant strings are cords.)
|
56 |
|
|
* CORD_printf (etc.) - cord version of printf. Use %r for cords.
|
57 |
|
|
*/
|
58 |
|
|
# ifndef CORD_H
|
59 |
|
|
|
60 |
|
|
# define CORD_H
|
61 |
|
|
# include <stddef.h>
|
62 |
|
|
# include <stdio.h>
|
63 |
|
|
/* Cords have type const char *. This is cheating quite a bit, and not */
|
64 |
|
|
/* 100% portable. But it means that nonempty character string */
|
65 |
|
|
/* constants may be used as cords directly, provided the string is */
|
66 |
|
|
/* never modified in place. The empty cord is represented by, and */
|
67 |
|
|
/* can be written as, 0. */
|
68 |
|
|
|
69 |
|
|
typedef const char * CORD;
|
70 |
|
|
|
71 |
|
|
/* An empty cord is always represented as nil */
|
72 |
|
|
# define CORD_EMPTY 0
|
73 |
|
|
|
74 |
|
|
/* Is a nonempty cord represented as a C string? */
|
75 |
|
|
#define CORD_IS_STRING(s) (*(s) != '\0')
|
76 |
|
|
|
77 |
|
|
/* Concatenate two cords. If the arguments are C strings, they may */
|
78 |
|
|
/* not be subsequently altered. */
|
79 |
|
|
CORD CORD_cat(CORD x, CORD y);
|
80 |
|
|
|
81 |
|
|
/* Concatenate a cord and a C string with known length. Except for the */
|
82 |
|
|
/* empty string case, this is a special case of CORD_cat. Since the */
|
83 |
|
|
/* length is known, it can be faster. */
|
84 |
|
|
/* The string y is shared with the resulting CORD. Hence it should */
|
85 |
|
|
/* not be altered by the caller. */
|
86 |
|
|
CORD CORD_cat_char_star(CORD x, const char * y, size_t leny);
|
87 |
|
|
|
88 |
|
|
/* Compute the length of a cord */
|
89 |
|
|
size_t CORD_len(CORD x);
|
90 |
|
|
|
91 |
|
|
/* Cords may be represented by functions defining the ith character */
|
92 |
|
|
typedef char (* CORD_fn)(size_t i, void * client_data);
|
93 |
|
|
|
94 |
|
|
/* Turn a functional description into a cord. */
|
95 |
|
|
CORD CORD_from_fn(CORD_fn fn, void * client_data, size_t len);
|
96 |
|
|
|
97 |
|
|
/* Return the substring (subcord really) of x with length at most n, */
|
98 |
|
|
/* starting at position i. (The initial character has position 0.) */
|
99 |
|
|
CORD CORD_substr(CORD x, size_t i, size_t n);
|
100 |
|
|
|
101 |
|
|
/* Return the argument, but rebalanced to allow more efficient */
|
102 |
|
|
/* character retrieval, substring operations, and comparisons. */
|
103 |
|
|
/* This is useful only for cords that were built using repeated */
|
104 |
|
|
/* concatenation. Guarantees log time access to the result, unless */
|
105 |
|
|
/* x was obtained through a large number of repeated substring ops */
|
106 |
|
|
/* or the embedded functional descriptions take longer to evaluate. */
|
107 |
|
|
/* May reallocate significant parts of the cord. The argument is not */
|
108 |
|
|
/* modified; only the result is balanced. */
|
109 |
|
|
CORD CORD_balance(CORD x);
|
110 |
|
|
|
111 |
|
|
/* The following traverse a cord by applying a function to each */
|
112 |
|
|
/* character. This is occasionally appropriate, especially where */
|
113 |
|
|
/* speed is crucial. But, since C doesn't have nested functions, */
|
114 |
|
|
/* clients of this sort of traversal are clumsy to write. Consider */
|
115 |
|
|
/* the functions that operate on cord positions instead. */
|
116 |
|
|
|
117 |
|
|
/* Function to iteratively apply to individual characters in cord. */
|
118 |
|
|
typedef int (* CORD_iter_fn)(char c, void * client_data);
|
119 |
|
|
|
120 |
|
|
/* Function to apply to substrings of a cord. Each substring is a */
|
121 |
|
|
/* a C character string, not a general cord. */
|
122 |
|
|
typedef int (* CORD_batched_iter_fn)(const char * s, void * client_data);
|
123 |
|
|
# define CORD_NO_FN ((CORD_batched_iter_fn)0)
|
124 |
|
|
|
125 |
|
|
/* Apply f1 to each character in the cord, in ascending order, */
|
126 |
|
|
/* starting at position i. If */
|
127 |
|
|
/* f2 is not CORD_NO_FN, then multiple calls to f1 may be replaced by */
|
128 |
|
|
/* a single call to f2. The parameter f2 is provided only to allow */
|
129 |
|
|
/* some optimization by the client. This terminates when the right */
|
130 |
|
|
/* end of this string is reached, or when f1 or f2 return != 0. In the */
|
131 |
|
|
/* latter case CORD_iter returns != 0. Otherwise it returns 0. */
|
132 |
|
|
/* The specified value of i must be < CORD_len(x). */
|
133 |
|
|
int CORD_iter5(CORD x, size_t i, CORD_iter_fn f1,
|
134 |
|
|
CORD_batched_iter_fn f2, void * client_data);
|
135 |
|
|
|
136 |
|
|
/* A simpler version that starts at 0, and without f2: */
|
137 |
|
|
int CORD_iter(CORD x, CORD_iter_fn f1, void * client_data);
|
138 |
|
|
# define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)
|
139 |
|
|
|
140 |
|
|
/* Similar to CORD_iter5, but end-to-beginning. No provisions for */
|
141 |
|
|
/* CORD_batched_iter_fn. */
|
142 |
|
|
int CORD_riter4(CORD x, size_t i, CORD_iter_fn f1, void * client_data);
|
143 |
|
|
|
144 |
|
|
/* A simpler version that starts at the end: */
|
145 |
|
|
int CORD_riter(CORD x, CORD_iter_fn f1, void * client_data);
|
146 |
|
|
|
147 |
|
|
/* Functions that operate on cord positions. The easy way to traverse */
|
148 |
|
|
/* cords. A cord position is logically a pair consisting of a cord */
|
149 |
|
|
/* and an index into that cord. But it is much faster to retrieve a */
|
150 |
|
|
/* charcter based on a position than on an index. Unfortunately, */
|
151 |
|
|
/* positions are big (order of a few 100 bytes), so allocate them with */
|
152 |
|
|
/* caution. */
|
153 |
|
|
/* Things in cord_pos.h should be treated as opaque, except as */
|
154 |
|
|
/* described below. Also note that */
|
155 |
|
|
/* CORD_pos_fetch, CORD_next and CORD_prev have both macro and function */
|
156 |
|
|
/* definitions. The former may evaluate their argument more than once. */
|
157 |
|
|
# include "private/cord_pos.h"
|
158 |
|
|
|
159 |
|
|
/*
|
160 |
|
|
Visible definitions from above:
|
161 |
|
|
|
162 |
|
|
typedef <OPAQUE but fairly big> CORD_pos[1];
|
163 |
|
|
|
164 |
|
|
* Extract the cord from a position:
|
165 |
|
|
CORD CORD_pos_to_cord(CORD_pos p);
|
166 |
|
|
|
167 |
|
|
* Extract the current index from a position:
|
168 |
|
|
size_t CORD_pos_to_index(CORD_pos p);
|
169 |
|
|
|
170 |
|
|
* Fetch the character located at the given position:
|
171 |
|
|
char CORD_pos_fetch(CORD_pos p);
|
172 |
|
|
|
173 |
|
|
* Initialize the position to refer to the given cord and index.
|
174 |
|
|
* Note that this is the most expensive function on positions:
|
175 |
|
|
void CORD_set_pos(CORD_pos p, CORD x, size_t i);
|
176 |
|
|
|
177 |
|
|
* Advance the position to the next character.
|
178 |
|
|
* P must be initialized and valid.
|
179 |
|
|
* Invalidates p if past end:
|
180 |
|
|
void CORD_next(CORD_pos p);
|
181 |
|
|
|
182 |
|
|
* Move the position to the preceding character.
|
183 |
|
|
* P must be initialized and valid.
|
184 |
|
|
* Invalidates p if past beginning:
|
185 |
|
|
void CORD_prev(CORD_pos p);
|
186 |
|
|
|
187 |
|
|
* Is the position valid, i.e. inside the cord?
|
188 |
|
|
int CORD_pos_valid(CORD_pos p);
|
189 |
|
|
*/
|
190 |
|
|
# define CORD_FOR(pos, cord) \
|
191 |
|
|
for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))
|
192 |
|
|
|
193 |
|
|
|
194 |
|
|
/* An out of memory handler to call. May be supplied by client. */
|
195 |
|
|
/* Must not return. */
|
196 |
|
|
extern void (* CORD_oom_fn)(void);
|
197 |
|
|
|
198 |
|
|
/* Dump the representation of x to stdout in an implementation defined */
|
199 |
|
|
/* manner. Intended for debugging only. */
|
200 |
|
|
void CORD_dump(CORD x);
|
201 |
|
|
|
202 |
|
|
/* The following could easily be implemented by the client. They are */
|
203 |
|
|
/* provided in cordxtra.c for convenience. */
|
204 |
|
|
|
205 |
|
|
/* Concatenate a character to the end of a cord. */
|
206 |
|
|
CORD CORD_cat_char(CORD x, char c);
|
207 |
|
|
|
208 |
|
|
/* Concatenate n cords. */
|
209 |
|
|
CORD CORD_catn(int n, /* CORD */ ...);
|
210 |
|
|
|
211 |
|
|
/* Return the character in CORD_substr(x, i, 1) */
|
212 |
|
|
char CORD_fetch(CORD x, size_t i);
|
213 |
|
|
|
214 |
|
|
/* Return < 0, 0, or > 0, depending on whether x < y, x = y, x > y */
|
215 |
|
|
int CORD_cmp(CORD x, CORD y);
|
216 |
|
|
|
217 |
|
|
/* A generalization that takes both starting positions for the */
|
218 |
|
|
/* comparison, and a limit on the number of characters to be compared. */
|
219 |
|
|
int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len);
|
220 |
|
|
|
221 |
|
|
/* Find the first occurrence of s in x at position start or later. */
|
222 |
|
|
/* Return the position of the first character of s in x, or */
|
223 |
|
|
/* CORD_NOT_FOUND if there is none. */
|
224 |
|
|
size_t CORD_str(CORD x, size_t start, CORD s);
|
225 |
|
|
|
226 |
|
|
/* Return a cord consisting of i copies of (possibly NUL) c. Dangerous */
|
227 |
|
|
/* in conjunction with CORD_to_char_star. */
|
228 |
|
|
/* The resulting representation takes constant space, independent of i. */
|
229 |
|
|
CORD CORD_chars(char c, size_t i);
|
230 |
|
|
# define CORD_nul(i) CORD_chars('\0', (i))
|
231 |
|
|
|
232 |
|
|
/* Turn a file into cord. The file must be seekable. Its contents */
|
233 |
|
|
/* must remain constant. The file may be accessed as an immediate */
|
234 |
|
|
/* result of this call and/or as a result of subsequent accesses to */
|
235 |
|
|
/* the cord. Short files are likely to be immediately read, but */
|
236 |
|
|
/* long files are likely to be read on demand, possibly relying on */
|
237 |
|
|
/* stdio for buffering. */
|
238 |
|
|
/* We must have exclusive access to the descriptor f, i.e. we may */
|
239 |
|
|
/* read it at any time, and expect the file pointer to be */
|
240 |
|
|
/* where we left it. Normally this should be invoked as */
|
241 |
|
|
/* CORD_from_file(fopen(...)) */
|
242 |
|
|
/* CORD_from_file arranges to close the file descriptor when it is no */
|
243 |
|
|
/* longer needed (e.g. when the result becomes inaccessible). */
|
244 |
|
|
/* The file f must be such that ftell reflects the actual character */
|
245 |
|
|
/* position in the file, i.e. the number of characters that can be */
|
246 |
|
|
/* or were read with fread. On UNIX systems this is always true. On */
|
247 |
|
|
/* MS Windows systems, f must be opened in binary mode. */
|
248 |
|
|
CORD CORD_from_file(FILE * f);
|
249 |
|
|
|
250 |
|
|
/* Equivalent to the above, except that the entire file will be read */
|
251 |
|
|
/* and the file pointer will be closed immediately. */
|
252 |
|
|
/* The binary mode restriction from above does not apply. */
|
253 |
|
|
CORD CORD_from_file_eager(FILE * f);
|
254 |
|
|
|
255 |
|
|
/* Equivalent to the above, except that the file will be read on demand.*/
|
256 |
|
|
/* The binary mode restriction applies. */
|
257 |
|
|
CORD CORD_from_file_lazy(FILE * f);
|
258 |
|
|
|
259 |
|
|
/* Turn a cord into a C string. The result shares no structure with */
|
260 |
|
|
/* x, and is thus modifiable. */
|
261 |
|
|
char * CORD_to_char_star(CORD x);
|
262 |
|
|
|
263 |
|
|
/* Turn a C string into a CORD. The C string is copied, and so may */
|
264 |
|
|
/* subsequently be modified. */
|
265 |
|
|
CORD CORD_from_char_star(const char *s);
|
266 |
|
|
|
267 |
|
|
/* Identical to the above, but the result may share structure with */
|
268 |
|
|
/* the argument and is thus not modifiable. */
|
269 |
|
|
const char * CORD_to_const_char_star(CORD x);
|
270 |
|
|
|
271 |
|
|
/* Write a cord to a file, starting at the current position. No */
|
272 |
|
|
/* trailing NULs are newlines are added. */
|
273 |
|
|
/* Returns EOF if a write error occurs, 1 otherwise. */
|
274 |
|
|
int CORD_put(CORD x, FILE * f);
|
275 |
|
|
|
276 |
|
|
/* "Not found" result for the following two functions. */
|
277 |
|
|
# define CORD_NOT_FOUND ((size_t)(-1))
|
278 |
|
|
|
279 |
|
|
/* A vague analog of strchr. Returns the position (an integer, not */
|
280 |
|
|
/* a pointer) of the first occurrence of (char) c inside x at position */
|
281 |
|
|
/* i or later. The value i must be < CORD_len(x). */
|
282 |
|
|
size_t CORD_chr(CORD x, size_t i, int c);
|
283 |
|
|
|
284 |
|
|
/* A vague analog of strrchr. Returns index of the last occurrence */
|
285 |
|
|
/* of (char) c inside x at position i or earlier. The value i */
|
286 |
|
|
/* must be < CORD_len(x). */
|
287 |
|
|
size_t CORD_rchr(CORD x, size_t i, int c);
|
288 |
|
|
|
289 |
|
|
|
290 |
|
|
/* The following are also not primitive, but are implemented in */
|
291 |
|
|
/* cordprnt.c. They provide functionality similar to the ANSI C */
|
292 |
|
|
/* functions with corresponding names, but with the following */
|
293 |
|
|
/* additions and changes: */
|
294 |
|
|
/* 1. A %r conversion specification specifies a CORD argument. Field */
|
295 |
|
|
/* width, precision, etc. have the same semantics as for %s. */
|
296 |
|
|
/* (Note that %c,%C, and %S were already taken.) */
|
297 |
|
|
/* 2. The format string is represented as a CORD. */
|
298 |
|
|
/* 3. CORD_sprintf and CORD_vsprintf assign the result through the 1st */ /* argument. Unlike their ANSI C versions, there is no need to guess */
|
299 |
|
|
/* the correct buffer size. */
|
300 |
|
|
/* 4. Most of the conversions are implement through the native */
|
301 |
|
|
/* vsprintf. Hence they are usually no faster, and */
|
302 |
|
|
/* idiosyncracies of the native printf are preserved. However, */
|
303 |
|
|
/* CORD arguments to CORD_sprintf and CORD_vsprintf are NOT copied; */
|
304 |
|
|
/* the result shares the original structure. This may make them */
|
305 |
|
|
/* very efficient in some unusual applications. */
|
306 |
|
|
/* The format string is copied. */
|
307 |
|
|
/* All functions return the number of characters generated or -1 on */
|
308 |
|
|
/* error. This complies with the ANSI standard, but is inconsistent */
|
309 |
|
|
/* with some older implementations of sprintf. */
|
310 |
|
|
|
311 |
|
|
/* The implementation of these is probably less portable than the rest */
|
312 |
|
|
/* of this package. */
|
313 |
|
|
|
314 |
|
|
#ifndef CORD_NO_IO
|
315 |
|
|
|
316 |
|
|
#include <stdarg.h>
|
317 |
|
|
|
318 |
|
|
int CORD_sprintf(CORD * out, CORD format, ...);
|
319 |
|
|
int CORD_vsprintf(CORD * out, CORD format, va_list args);
|
320 |
|
|
int CORD_fprintf(FILE * f, CORD format, ...);
|
321 |
|
|
int CORD_vfprintf(FILE * f, CORD format, va_list args);
|
322 |
|
|
int CORD_printf(CORD format, ...);
|
323 |
|
|
int CORD_vprintf(CORD format, va_list args);
|
324 |
|
|
|
325 |
|
|
#endif /* CORD_NO_IO */
|
326 |
|
|
|
327 |
|
|
# endif /* CORD_H */
|