File Source: LinkbackExtractor.java
/*
P/P * Method: org.apache.roller.weblogger.util.LinkbackExtractor$1__static_init
*/
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. The ASF licenses this file to You
4 * under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License. For additional information regarding
15 * copyright in this work, please see the NOTICE file in the top level
16 * directory of this distribution.
17 */
18 package org.apache.roller.weblogger.util;
19
20 import java.io.BufferedReader;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.io.StringReader;
25 import java.net.MalformedURLException;
26 import java.net.URL;
27 import java.util.Arrays;
28 import java.util.Iterator;
29 import java.util.List;
30
31 import javax.swing.text.MutableAttributeSet;
32 import javax.swing.text.html.HTML;
33 import javax.swing.text.html.HTMLEditorKit;
34 import javax.swing.text.html.HTML.Tag;
35 import javax.swing.text.html.HTMLEditorKit.Parser;
36 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
37
38 import org.apache.commons.logging.Log;
39 import org.apache.commons.logging.LogFactory;
40
41 import com.sun.syndication.feed.synd.SyndEntry;
42 import com.sun.syndication.feed.synd.SyndFeed;
43 import com.sun.syndication.io.FeedException;
44 import com.sun.syndication.io.SyndFeedInput;
45
46 /**
47 * Parses HTML file for referring linkback title and excerpt.
48 *
49 * @author David M Johnson
50 */
/*
P/P * Method: String access$984(LinkbackExtractor, Object)
*
* Preconditions:
* init'ed(x0.mTitle)
* x0 != null
*
* Postconditions:
* java.lang.StringBuilder:toString(...)._tainted == x0.mTitle._tainted | x1._tainted
* init'ed(java.lang.StringBuilder:toString(...)._tainted)
* return_value == &java.lang.StringBuilder:toString(...)
* x0.mTitle == &java.lang.StringBuilder:toString(...)
*/
51 public class LinkbackExtractor
52 {
/*
P/P * Method: org.apache.roller.weblogger.util.LinkbackExtractor__static_init
*
* Presumptions:
* org.apache.commons.logging.LogFactory:getFactory(...)@53 != null
*
* Postconditions:
* init'ed(mLogger)
*/
53 private static Log mLogger = LogFactory.getFactory().getInstance(
54 LinkbackExtractor.class);
55 private boolean mFound = false;
56 private String mTitle = "";
57 private String mRssLink = null;
58 private String mExcerpt = null;
59 private String mPermalink = null;
60 private int mStart = 0;
61 private int mEnd = 0;
62 private int mMaxExcerpt = 500; // characters
63 private String mRequestURL = null;
64 private String mRequestURLWWW = null;
65 private String mRefererURL;
66
67 //------------------------------------------------------------------------
68 /**
69 * Extract referring page title, excerpt, and permalink.
70 *
71 * @param refererUrl
72 * @param requestUrl
73 */
74 public LinkbackExtractor(String refererURL, String requestURL)
75 throws MalformedURLException, IOException
/*
P/P * Method: void org.apache.roller.weblogger.util.LinkbackExtractor(String, String)
*
* Preconditions:
* (soft) mLogger != null
* (soft) requestURL != null
*
* Postconditions:
* init'ed(java.lang.String:substring(...)._tainted)
* init'ed(java.lang.String:toString(...)._tainted)
* init'ed(java.lang.StringBuilder:toString(...)._tainted)
* this.mEnd == 0
* this.mStart == 0
* init'ed(this.mExcerpt)
* init'ed(this.mFound)
* this.mMaxExcerpt == 500
* init'ed(this.mPermalink)
* this.mRefererURL == One-of{refererURL, undefined}
* ...
*
* Test Vectors:
* this.mRssLink: Addr_Set{null}, Inverse{null}
*/
76 {
77 try
78 {
79 extractByParsingHtml(refererURL, requestURL);
80 if (mRssLink != null)
81 {
82 extractByParsingRss(mRssLink, requestURL);
83 }
84 }
85 catch (Exception e)
86 {
87 if (mLogger.isDebugEnabled())
88 {
89 mLogger.debug("Extracting linkback", e);
90 }
91 }
92 }
93
94 //------------------------------------------------------------------------
95 private void extractByParsingHtml(String refererURL, String requestURL)
96 throws MalformedURLException, IOException
97 {
/*
P/P * Method: void extractByParsingHtml(String, String)
*
* Preconditions:
* this.mTitle != null
* requestURL != null
* init'ed(this.mStart)
* (soft) init'ed(this.mEnd)
* (soft) init'ed(this.mMaxExcerpt)
*
* Presumptions:
* javax.swing.text.html.HTMLEditorKit:getParser(...)@118 != null
*
* Postconditions:
* init'ed(java.lang.String:substring(...)._tainted)
* init'ed(java.lang.StringBuilder:toString(...)._tainted)
* possibly_updated(this.mExcerpt)
* this.mRefererURL == refererURL
* init'ed(this.mRefererURL)
* this.mRequestURL == One-of{&java.lang.StringBuilder:toString(...), requestURL}
* this.mRequestURL != null
* this.mRequestURLWWW == One-of{requestURL, &java.lang.StringBuilder:toString(...)}
* this.mRequestURLWWW != null
* this.mTitle == One-of{old this.mTitle, &java.lang.String:substring(...)}
* ...
*
* Test Vectors:
* this.mEnd: {0}, {-231+1..-1, 1..232-1}
* this.mEnd - this.mStart: {-6_442_450_943..0}, {1..6_442_450_943}
* this.mStart: {0}, {-231..-1, 1..232-2}
* java.io.BufferedReader:readLine(...)@129: Inverse{null}, Addr_Set{null}
* java.lang.String:length(...)@155: {0,1}, {2..232-1}
* java.lang.String:startsWith(...)@103: {0}, {1}
* java.lang.String:startsWith(...)@155: {0}, {1}
*/
98 URL url = new URL(refererURL);
99 InputStream is = url.openStream();
100
101 mRefererURL = refererURL;
102
103 if (requestURL.startsWith("http://www."))
104 {
105 mRequestURLWWW = requestURL;
106 mRequestURL = "http://" + mRequestURLWWW.substring(11);
107 }
108 else
109 {
110 mRequestURL = requestURL;
111 mRequestURLWWW = "http://www." + mRequestURL.substring(7);
112 }
113
114 // Trick gets Swing's HTML parser
/*
P/P * Method: void org.apache.roller.weblogger.util.LinkbackExtractor$1(LinkbackExtractor)
*/
115 Parser parser = (new HTMLEditorKit() {
116 public Parser getParser()
117 {
/*
P/P * Method: HTMLEditorKit$Parser getParser()
*
* Postconditions:
* init'ed(return_value)
*/
118 return super.getParser();
119 }
120 }).getParser();
121
122 // Read HTML file into string
123 StringBuffer sb = new StringBuffer();
124 InputStreamReader isr = new InputStreamReader(is);
125 BufferedReader br = new BufferedReader(isr);
126 try
127 {
128 String line = null;
129 while ((line = br.readLine()) != null)
130 {
131 sb.append(line);
132 }
133 }
134 finally
135 {
136 br.close();
137 }
138
139 // Parse HTML string to find title and start and end position
140 // of the referring excerpt.
141 StringReader sr = new StringReader(sb.toString());
142 parser.parse(sr, new LinkbackCallback(), true);
143
144 if (mStart != 0 && mEnd != 0 && mEnd > mStart)
145 {
146 mExcerpt = sb.toString().substring(mStart, mEnd);
147 mExcerpt = Utilities.removeHTML(mExcerpt);
148
149 if (mExcerpt.length() > mMaxExcerpt)
150 {
151 mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
152 }
153 }
154
155 if (mTitle.startsWith(">") && mTitle.length() > 1)
156 {
157 mTitle = mTitle.substring(1);
158 }
159 }
160
161 //------------------------------------------------------------------------
162 private void extractByParsingRss(String rssLink, String requestURL)
163 throws IllegalArgumentException, MalformedURLException, FeedException, IOException
164 {
/*
P/P * Method: void extractByParsingRss(String, String)
*
* Preconditions:
* mLogger != null
* (soft) init'ed(this.mFound)
* (soft) init'ed(this.mMaxExcerpt)
*
* Presumptions:
* com.sun.syndication.feed.synd.SyndContent:getValue(...)@182 != null
* com.sun.syndication.feed.synd.SyndEntry:getDescription(...)@182 != null
* com.sun.syndication.feed.synd.SyndEntry:getDescription(...)@194 != null
* com.sun.syndication.feed.synd.SyndEntry:getLink(...)@185 != null
* com.sun.syndication.feed.synd.SyndFeed:getEntries(...)@168 != null
* ...
*
* Postconditions:
* init'ed(java.lang.String:toString(...)._tainted)
* init'ed(java.lang.StringBuilder:toString(...)._tainted)
* possibly_updated(this.mExcerpt)
* (soft) init'ed(this.mFound)
* this.mPermalink == One-of{old this.mPermalink, &java.lang.String:toString(...)}
* possibly_updated(this.mTitle)
*
* Test Vectors:
* com.sun.syndication.feed.synd.SyndFeed:getTitle(...)@169: Addr_Set{null}, Inverse{null}
* java.lang.String:indexOf(...)@182: {-1}, {-231..-2, 0..232-1}
* java.lang.String:length(...)@186: {0}, {1..232-1}
* java.util.Iterator:hasNext(...)@178: {0}, {1}
* org.apache.commons.logging.Log:isDebugEnabled(...)@173: {0}, {1}
* org.apache.commons.logging.Log:isDebugEnabled(...)@204: {0}, {1}
*/
165 SyndFeedInput feedInput = new SyndFeedInput();
166 SyndFeed feed = feedInput.build(
167 new InputStreamReader(new URL(rssLink).openStream()));
168 Iterator itemIter = feed.getEntries().iterator();
169 String feedTitle = feed.getTitle();
170
171 int count = 0;
172
173 if (mLogger.isDebugEnabled())
174 {
175 mLogger.debug("Feed parsed, title: " + feedTitle);
176 }
177
178 while (itemIter.hasNext())
179 {
+ 180 count++;
181 SyndEntry item = (SyndEntry) itemIter.next();
182 if (item.getDescription().getValue().indexOf(requestURL) != -1)
183 {
184 mFound = true;
185 mPermalink = item.getLink().toString();
186 if (feedTitle != null && feedTitle.trim().length() > 0)
187 {
188 mTitle = feedTitle + ": " + item.getTitle();
189 }
190 else
191 {
192 mTitle = item.getTitle();
193 }
194 mExcerpt = item.getDescription().getValue();
195 mExcerpt = Utilities.removeHTML(mExcerpt);
196 if (mExcerpt.length() > mMaxExcerpt)
197 {
198 mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
199 }
200 break;
201 }
202 }
203
204 if (mLogger.isDebugEnabled())
205 {
206 mLogger.debug("Parsed " + count + " articles, found linkback="
207 + mFound);
208 }
209 }
210
211 //------------------------------------------------------------------------
212 /**
213 * Returns the excerpt.
214 *
215 * @return String
216 */
217 public String getExcerpt()
218 {
/*
P/P * Method: String getExcerpt()
*
* Preconditions:
* init'ed(this.mExcerpt)
*
* Postconditions:
* return_value == this.mExcerpt
* init'ed(return_value)
*/
219 return mExcerpt;
220 }
221
222 //------------------------------------------------------------------------
223 /**
224 * Returns the title.
225 *
226 * @return String
227 */
228 public String getTitle()
229 {
/*
P/P * Method: String getTitle()
*
* Preconditions:
* init'ed(this.mTitle)
*
* Postconditions:
* return_value == this.mTitle
* init'ed(return_value)
*/
230 return mTitle;
231 }
232
233 //------------------------------------------------------------------------
234 /**
235 * Returns the permalink.
236 *
237 * @return String
238 */
239 public String getPermalink()
240 {
/*
P/P * Method: String getPermalink()
*
* Preconditions:
* init'ed(this.mPermalink)
*
* Postconditions:
* return_value == this.mPermalink
* init'ed(return_value)
*/
241 return mPermalink;
242 }
243
244 //------------------------------------------------------------------------
245 /**
246 * Sets the permalink.
247 *
248 * @param permalink
249 * The permalink to set
250 */
251 public void setPermalink(String permalink)
252 {
/*
P/P * Method: void setPermalink(String)
*
* Postconditions:
* this.mPermalink == permalink
* init'ed(this.mPermalink)
*/
253 mPermalink = permalink;
254 }
255
256 /////////////////////////////////////////////////////////////////////////
257
258 /**
259 * Parser callback that finds title and excerpt. As we walk through the HTML
260 * tags, we keep track of the most recently encountered divider tag in the
261 * mStart field. Once we find the referring permalink, we set the mFound
262 * flag. After that, we look for the next divider tag and save it's position
263 * in the mEnd field.
264 */
/*
P/P * Method: void org.apache.roller.weblogger.util.LinkbackExtractor$LinkbackCallback(LinkbackExtractor, LinkbackExtractor$1)
*
* Presumptions:
* init'ed(javax.swing.text.html.HTML$Tag.BLOCKQUOTE)
* init'ed(javax.swing.text.html.HTML$Tag.BR)
* init'ed(javax.swing.text.html.HTML$Tag.DIV)
* init'ed(javax.swing.text.html.HTML$Tag.H1)
* init'ed(javax.swing.text.html.HTML$Tag.H2)
* ...
*
* Postconditions:
* this.mCurrentTag == null
* this.mDivTags == &new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1)
* init'ed(this.mList)
* new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1) num objects == 1
* new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1).length == 15
* new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1)[0] == javax.swing.text.html.HTML$Tag.TD
* init'ed(new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1)[0])
* new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1)[10] == javax.swing.text.html.HTML$Tag.H2
* init'ed(new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1)[10])
* new HTML$Tag[](LinkbackExtractor$LinkbackCallback#1)[11] == javax.swing.text.html.HTML$Tag.H3
* ...
*/
265 private final class LinkbackCallback extends ParserCallback
266 {
267 // Dividers
268 private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
269 Tag.BLOCKQUOTE, Tag.P, Tag.LI,
270 Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
271 Tag.H2, Tag.H3, Tag.H4, Tag.H5,
272 Tag.H6 };
273
274 private List mList = Arrays.asList(mDivTags);
275
276 private Tag mCurrentTag = null;
277
278 /**
279 * Look for divider tags and for the permalink.
280 *
281 * @param tag
282 * HTML tag
283 * @param atts
284 * Attributes of that tag
285 * @param pos
286 * Tag's position in file
287 */
288 public void handleStartTag(Tag tag, MutableAttributeSet atts, int pos)
289 {
/*
P/P * Method: void handleStartTag(HTML$Tag, MutableAttributeSet, int)
*
* Preconditions:
* this.mList != null
* (soft) atts != null
* (soft) init'ed(this.mEnd)
* (soft) init'ed(this.mFound)
* (soft) tag != null
* (soft) init'ed(this.mRequestURL)
* (soft) init'ed(this.mRequestURLWWW)
*
* Presumptions:
* init'ed(javax.swing.text.html.HTML$Attribute.HREF)
* init'ed(javax.swing.text.html.HTML$Tag.A)
*
* Postconditions:
* this.mCurrentTag == One-of{old this.mCurrentTag, tag}
* this.mEnd == One-of{old this.mEnd, pos}
* (soft) init'ed(this.mEnd)
* (soft) init'ed(this.mFound)
* this.mStart == One-of{old this.mStart, pos}
*
* Test Vectors:
* this.mEnd: {-231..-1, 1..232-1}, {0}
* this.mFound: {1}, {0}
* java.lang.Object:equals(...)@298: {0}, {1}
* java.lang.String:equals(...)@308: {1}, {0}
* java.lang.String:equals(...)@308: {0}, {1}
* java.lang.String:lastIndexOf(...)@303: {-1}, {-231..-2, 0..232-1}
* java.util.List:contains(...)@290: {0}, {1}
* java.util.List:contains(...)@294: {0}, {1}
* javax.swing.text.MutableAttributeSet:getAttribute(...)@300: Inverse{null}, Addr_Set{null}
*/
290 if (mList.contains(tag) && !mFound)
291 {
292 mStart = pos;
293 }
294 else if (mList.contains(tag) && mFound && mEnd == 0)
295 {
296 mEnd = pos;
297 }
298 else if (tag.equals(Tag.A))
299 {
300 String href = (String) atts.getAttribute(HTML.Attribute.HREF);
301 if (href == null)
302 return;
303 int hashPos = href.lastIndexOf('#');
304 if (hashPos != -1)
305 {
306 href = href.substring(0, hashPos);
307 }
+ 308 if (href != null
309 && (href.equals(mRequestURL) || href
310 .equals(mRequestURLWWW)))
311 {
312 mFound = true;
313 }
314 else
315 {
316 /*
317 * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
318 * "+href); }
319 */
320 }
321 }
322 mCurrentTag = tag;
323 }
324
325 /**
326 * Needed to handle SPAN tag.
327 */
328 public void handleSimpleTag(Tag tag, MutableAttributeSet atts, int pos)
329 {
/*
P/P * Method: void handleSimpleTag(HTML$Tag, MutableAttributeSet, int)
*
* Preconditions:
* this.mList != null
* (soft) atts != null
* (soft) init'ed(this.mEnd)
* (soft) org/apache/roller/weblogger/util/LinkbackExtractor.mLogger != null
* (soft) tag != null
* (soft) init'ed(this.mFound)
* (soft) this.mRefererURL != null
*
* Presumptions:
* javax.swing.text.MutableAttributeSet:getAttribute(...)@343 != null
* init'ed(javax.swing.text.html.HTML$Attribute.HREF)
* init'ed(javax.swing.text.html.HTML$Attribute.TITLE)
* init'ed(javax.swing.text.html.HTML$Attribute.TYPE)
* init'ed(javax.swing.text.html.HTML$Tag.LINK)
*
* Postconditions:
* init'ed(java.lang.StringBuilder:toString(...)._tainted)
* (soft) init'ed(this.mEnd)
* possibly_updated(this.mRssLink)
*
* Test Vectors:
* this.mEnd: {-231..-1, 1..232-1}, {0}
* this.mFound: {0}, {1}
* java.lang.Object:equals(...)@334: {0}, {1}
* java.lang.String:equals(...)@339: {0}, {1}
* java.lang.String:lastIndexOf(...)@370: {-1}, {-231..-2, 0..232-1}
* java.lang.String:length(...)@350: {0,1}, {2..232-1}
* java.lang.String:startsWith(...)@350: {0}, {1}
* java.lang.String:startsWith(...)@368: {1}, {0}
* java.util.List:contains(...)@330: {0}, {1}
* javax.swing.text.MutableAttributeSet:getAttribute(...)@337: Addr_Set{null}, Inverse{null}
* ...
*/
330 if (mList.contains(tag) && mFound && mEnd == 0)
331 {
332 mEnd = pos;
333 }
334 else if (tag.equals(Tag.LINK))
335 {
336 // Look out for RSS autodiscovery link
337 String title = (String) atts.getAttribute(HTML.Attribute.TITLE);
338 String type = (String) atts.getAttribute(HTML.Attribute.TYPE);
339 if (title != null && type != null
340 && type.equals("application/rss+xml")
341 && title.equals("RSS"))
342 {
343 mRssLink = (String) atts.getAttribute(HTML.Attribute.HREF);
344
345 if (mLogger.isDebugEnabled())
346 {
347 mLogger.debug("Found RSS link " + mRssLink);
348 }
349
350 if (mRssLink.startsWith("/") && mRssLink.length() > 1)
351 {
352 try
353 {
354 URL url = new URL(mRefererURL);
355 mRssLink = url.getProtocol() + "://"
356 + url.getHost() + ":" + url.getPort()
357 + mRssLink;
358 }
359 catch (MalformedURLException e)
360 {
361 mRssLink = null;
362 if (mLogger.isDebugEnabled())
363 {
364 mLogger.debug("Determining RSS URL", e);
365 }
366 }
367 }
368 else if (!mRssLink.startsWith("http"))
369 {
370 int slash = mRefererURL.lastIndexOf("/");
371 if (slash != -1)
372 {
373 mRssLink = mRefererURL.substring(0, slash) + "/"
374 + mRssLink;
375 }
376 }
377 if (mLogger.isDebugEnabled())
378 {
379 mLogger.debug("Qualified RSS link is " + mRssLink);
380 }
381 }
382 }
383 }
384
385 /**
386 * Stop at the very first divider tag after the permalink.
387 *
388 * @param tag
389 * End tag
390 * @param pos
391 * Position in HTML file
392 */
393 public void handleEndTag(Tag tag, int pos)
394 {
/*
P/P * Method: void handleEndTag(HTML$Tag, int)
*
* Preconditions:
* this.mList != null
* (soft) init'ed(this.mEnd)
* (soft) init'ed(this.mFound)
*
* Postconditions:
* this.mCurrentTag == One-of{old this.mCurrentTag, null}
* this.mEnd == One-of{pos, old this.mEnd}
* (soft) init'ed(this.mEnd)
* this.mStart == One-of{old this.mStart, pos}
*
* Test Vectors:
* this.mEnd: {-231..-1, 1..232-1}, {0}
* this.mFound: {0}, {1}
* java.util.List:contains(...)@395: {0}, {1}
* java.util.List:contains(...)@399: {0}, {1}
*/
395 if (mList.contains(tag) && mFound && mEnd == 0)
396 {
397 mEnd = pos;
398 }
399 else if (mList.contains(tag) && !mFound)
400 {
401 mStart = pos;
402 }
403 else
404 {
405 mCurrentTag = null;
406 }
407 }
408
409 /**
410 * Get the page title
411 */
412 public void handleText(char[] data, int pos)
413 {
/*
P/P * Method: void handleText(char[], int)
*
* Preconditions:
* init'ed(this.mCurrentTag)
* (soft) this.mTitle != null
*
* Presumptions:
* init'ed(javax.swing.text.html.HTML$Tag.TITLE)
*
* Postconditions:
* init'ed(java.lang.StringBuilder:toString(...)._tainted)
* this.mTitle == One-of{old this.mTitle, &java.lang.StringBuilder:toString(...)}
* (soft) this.mTitle != null
*
* Test Vectors:
* this.mCurrentTag: Addr_Set{null}, Inverse{null}
* java.lang.Object:equals(...)@414: {0}, {1}
* java.lang.String:length(...)@417: {50..232-1}, {0..49}
*/
414 if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
415 {
416 String newText = new String(data);
417 if (mTitle.length() < 50)
418 {
419 mTitle += newText;
420 }
421 }
422 }
423 }
424 }
425
SofCheck Inspector Build Version : 2.18479
| LinkbackExtractor.java |
2009-Jan-02 14:24:48 |
| LinkbackExtractor.class |
2009-Sep-04 03:12:32 |
| LinkbackExtractor$1.class |
2009-Sep-04 03:12:32 |
| LinkbackExtractor$LinkbackCallback.class |
2009-Sep-04 03:12:32 |