OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libjava/] [classpath/] [gnu/] [xml/] [pipeline/] [LinkFilter.java] - Blame information for rev 769

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 769 jeremybenn
/* LinkFilter.java --
2
   Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
3
 
4
This file is part of GNU Classpath.
5
 
6
GNU Classpath is free software; you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 2, or (at your option)
9
any later version.
10
 
11
GNU Classpath is distributed in the hope that it will be useful, but
12
WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
General Public License for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GNU Classpath; see the file COPYING.  If not, write to the
18
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
02110-1301 USA.
20
 
21
Linking this library statically or dynamically with other modules is
22
making a combined work based on this library.  Thus, the terms and
23
conditions of the GNU General Public License cover the whole
24
combination.
25
 
26
As a special exception, the copyright holders of this library give you
27
permission to link this library with independent modules to produce an
28
executable, regardless of the license terms of these independent
29
modules, and to copy and distribute the resulting executable under
30
terms of your choice, provided that you also meet, for each linked
31
independent module, the terms and conditions of the license of that
32
module.  An independent module is a module which is not derived from
33
or based on this library.  If you modify this library, you may extend
34
this exception to your version of the library, but you are not
35
obligated to do so.  If you do not wish to do so, delete this
36
exception statement from your version. */
37
 
38
package gnu.xml.pipeline;
39
 
40
import java.io.IOException;
41
import java.net.URL;
42
import java.util.Enumeration;
43
import java.util.Vector;
44
 
45
import org.xml.sax.Attributes;
46
import org.xml.sax.SAXException;
47
 
48
 
49
/**
50
 * Pipeline filter to remember XHTML links found in a document,
51
 * so they can later be crawled.  Fragments are not counted, and duplicates
52
 * are ignored.  Callers are responsible for filtering out URLs they aren't
53
 * interested in.  Events are passed through unmodified.
54
 *
55
 * <p> Input MUST include a setDocumentLocator() call, as it's used to
56
 * resolve relative links in the absence of a "base" element.  Input MUST
57
 * also include namespace identifiers, since it is the XHTML namespace
58
 * identifier which is used to identify the relevant elements.
59
 *
60
 * <p><em>FIXME:</em> handle xml:base attribute ... in association with
61
 * a stack of base URIs.  Similarly, recognize/support XLink data.
62
 *
63
 * @author David Brownell
64
 */
65
public class LinkFilter extends EventFilter
66
{
67
    // for storing URIs
68
    private Vector              vector = new Vector ();
69
 
70
        // struct for "full" link record (tbd)
71
        // these for troubleshooting original source:
72
        //      original uri
73
        //      uri as resolved (base, relative, etc)
74
        //      URI of originating doc
75
        //      line #
76
        //      original element + attrs (img src, desc, etc)
77
 
78
        // XLink model of the link ... for inter-site pairups ?
79
 
80
    private String              baseURI;
81
 
82
    private boolean             siteRestricted = false;
83
 
84
    //
85
    // XXX leverage blacklist info (like robots.txt)
86
    //
87
    // XXX constructor w/param ... pipeline for sending link data
88
    // probably XHTML --> XLink, providing info as sketched above
89
    //
90
 
91
 
92
    /**
93
     * Constructs a new event filter, which collects links in private data
94
     * structure for later enumeration.
95
     */
96
        // constructor used by PipelineFactory
97
    public LinkFilter ()
98
    {
99
        super.setContentHandler (this);
100
    }
101
 
102
 
103
    /**
104
     * Constructs a new event filter, which collects links in private data
105
     * structure for later enumeration and passes all events, unmodified,
106
     * to the next consumer.
107
     */
108
        // constructor used by PipelineFactory
109
    public LinkFilter (EventConsumer next)
110
    {
111
        super (next);
112
        super.setContentHandler (this);
113
    }
114
 
115
 
116
    /**
117
     * Returns an enumeration of the links found since the filter
118
     * was constructed, or since removeAllLinks() was called.
119
     *
120
     * @return enumeration of strings.
121
     */
122
    public Enumeration getLinks ()
123
    {
124
        return vector.elements ();
125
    }
126
 
127
    /**
128
     * Removes records about all links reported to the event
129
     * stream, as if the filter were newly created.
130
     */
131
    public void removeAllLinks ()
132
    {
133
        vector = new Vector ();
134
    }
135
 
136
 
137
    /**
138
     * Collects URIs for (X)HTML content from elements which hold them.
139
     */
140
    public void startElement (
141
        String          uri,
142
        String          localName,
143
        String          qName,
144
        Attributes      atts
145
    ) throws SAXException
146
    {
147
        String  link;
148
 
149
        // Recognize XHTML links.
150
        if ("http://www.w3.org/1999/xhtml".equals (uri)) {
151
 
152
            if ("a".equals (localName) || "base".equals (localName)
153
                    || "area".equals (localName))
154
                link = atts.getValue ("href");
155
            else if ("iframe".equals (localName) || "frame".equals (localName))
156
                link = atts.getValue ("src");
157
            else if ("blockquote".equals (localName) || "q".equals (localName)
158
                    || "ins".equals (localName) || "del".equals (localName))
159
                link = atts.getValue ("cite");
160
            else
161
                link = null;
162
            link = maybeAddLink (link);
163
 
164
            // "base" modifies designated baseURI
165
            if ("base".equals (localName) && link != null)
166
                baseURI = link;
167
 
168
            if ("iframe".equals (localName) || "img".equals (localName))
169
                maybeAddLink (atts.getValue ("longdesc"));
170
        }
171
 
172
        super.startElement (uri, localName, qName, atts);
173
    }
174
 
175
    private String maybeAddLink (String link)
176
    {
177
        int             index;
178
 
179
        // ignore empty links and fragments inside docs
180
        if (link == null)
181
            return null;
182
        if ((index = link.indexOf ("#")) >= 0)
183
            link = link.substring (0, index);
184
        if (link.equals (""))
185
            return null;
186
 
187
        try {
188
            // get the real URI
189
            URL         base = new URL ((baseURI != null)
190
                                    ? baseURI
191
                                    : getDocumentLocator ().getSystemId ());
192
            URL         url = new URL (base, link);
193
 
194
            link = url.toString ();
195
 
196
            // ignore duplicates
197
            if (vector.contains (link))
198
                return link;
199
 
200
            // other than what "base" does, stick to original site:
201
            if (siteRestricted) {
202
                // don't switch protocols
203
                if (!base.getProtocol ().equals (url.getProtocol ()))
204
                    return link;
205
                // don't switch servers
206
                if (base.getHost () != null
207
                        && !base.getHost ().equals (url.getHost ()))
208
                    return link;
209
            }
210
 
211
            vector.addElement (link);
212
 
213
            return link;
214
 
215
        } catch (IOException e) {
216
            // bad URLs we don't want
217
        }
218
        return null;
219
    }
220
 
221
    /**
222
     * Reports an error if no Locator has been made available.
223
     */
224
    public void startDocument ()
225
    throws SAXException
226
    {
227
        if (getDocumentLocator () == null)
228
            throw new SAXException ("no Locator!");
229
    }
230
 
231
    /**
232
     * Forgets about any base URI information that may be recorded.
233
     * Applications will often want to call removeAllLinks(), likely
234
     * after examining the links which were reported.
235
     */
236
    public void endDocument ()
237
    throws SAXException
238
    {
239
        baseURI = null;
240
        super.endDocument ();
241
    }
242
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.