View Javadoc
1   /*
2    *  This file is part of the Heritrix web crawler (crawler.archive.org).
3    *
4    *  Licensed to the Internet Archive (IA) by one or more individual 
5    *  contributors. 
6    *
7    *  The IA licenses this file to You under the Apache License, Version 2.0
8    *  (the "License"); you may not use this file except in compliance with
9    *  the License.  You may obtain a copy of the License at
10   *
11   *      http://www.apache.org/licenses/LICENSE-2.0
12   *
13   *  Unless required by applicable law or agreed to in writing, software
14   *  distributed under the License is distributed on an "AS IS" BASIS,
15   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   *  See the License for the specific language governing permissions and
17   *  limitations under the License.
18   */
19  package org.archive.wayback.accesscontrol.robotstxt;
20  
21  import java.io.BufferedReader;
22  import java.io.IOException;
23  import java.io.Serializable;
24  import java.util.HashMap;
25  import java.util.LinkedList;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import org.apache.commons.io.IOUtils;
32  import org.archive.io.ReadSource;
33  
34  /**
35   * Utility class for parsing and representing 'robots.txt' format 
36   * directives, into a list of named user-agents and map from user-agents 
37   * to RobotsDirectives. 
38   */
39  public class Robotstxt implements Serializable {
40      static final long serialVersionUID = 7025386509301303890L;
41      private static final Logger logger =
42          Logger.getLogger(Robotstxt.class.getName());
43  
44      // all user agents contained in this robots.txt
45      // in order of declaration
46      // TODO: consider discarding irrelevant entries
47      LinkedList<String> namedUserAgents = new LinkedList<String>();
48      // map user-agents to directives
49      Map<String,RobotsDirectives> agentsToDirectives = 
50          new HashMap<String,RobotsDirectives>();
51      RobotsDirectives wildcardDirectives = null; 
52      
53      boolean hasErrors = false;
54      
55      static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
56      /** empty, reusable instance for all sites providing no rules */
57      public static Robotstxt NO_ROBOTS = new Robotstxt();
58      
59      public Robotstxt() {
60      }
61  
62      public Robotstxt(BufferedReader reader) throws IOException {
63          initializeFromReader(reader);
64      }
65  
66      public Robotstxt(ReadSource customRobots) {
67          BufferedReader reader = new BufferedReader(customRobots.obtainReader());
68          try {
69              initializeFromReader(reader);
70          } catch (IOException e) {
71              logger.log(Level.SEVERE,
72                      "robots ReadSource problem: potential for inadvertent overcrawling",
73                      e);
74          } finally {
75              IOUtils.closeQuietly(reader); 
76          }
77      }
78  
79      protected void initializeFromReader(BufferedReader reader) throws IOException {
80          String read;
81          // current is the disallowed paths for the preceding User-Agent(s)
82          RobotsDirectives current = null;
83          // whether a non-'User-Agent' directive has been encountered
84          boolean hasDirectivesYet = false; 
85          while (reader != null) {
86              do {
87                  read = reader.readLine();
88                  // Skip comments & blanks
89              } while ((read != null) && ((read = read.trim()).startsWith("#") ||
90                  read.length() == 0));
91              if (read == null) {
92                  reader.close();
93                  reader = null;
94              } else {
95                  // remove any html markup
96                  read = read.replaceAll("<[^>]+>","");
97                  int commentIndex = read.indexOf("#");
98                  if (commentIndex > -1) {
99                      // Strip trailing comment
100                     read = read.substring(0, commentIndex);
101                 }
102                 read = read.trim();
103                 if (read.matches("(?i)^User-agent:.*")) {
104                     String ua = read.substring(11).trim().toLowerCase();
105                     if (current == null || hasDirectivesYet ) {
106                         // only create new rules-list if necessary
107                         // otherwise share with previous user-agent
108                         current = new RobotsDirectives();
109                         hasDirectivesYet = false; 
110                     }
111                     if (ua.equals("*")) {
112                         wildcardDirectives = current;
113                     } else {
114                         namedUserAgents.addLast(ua);
115                         agentsToDirectives.put(ua, current);
116                     }
117                     continue;
118                 }
119                 if (read.matches("(?i)Disallow:.*")) {
120                     if (current == null) {
121                         // buggy robots.txt
122                         hasErrors = true;
123                         continue;
124                     }
125                     String path = read.substring(9).trim();
126                     // tolerate common error of ending path with '*' character
127                     // (not allowed by original spec; redundant but harmless with 
128                     // Google's wildcarding extensions -- which we don't yet fully
129                     // support). 
130                     if(path.endsWith("*")) {
131                         path = path.substring(0,path.length()-1); 
132                     }
133                     current.addDisallow(path);
134                     hasDirectivesYet = true; 
135                     continue;
136                 }
137                 if (read.matches("(?i)Crawl-delay:.*")) {
138                     if (current == null) {
139                         // buggy robots.txt
140                         hasErrors = true;
141                         continue;
142                     }
143                     // consider a crawl-delay, even though we don't 
144                     // yet understand it, as sufficient to end a 
145                     // grouping of User-Agent lines
146                     hasDirectivesYet = true;
147                     String val = read.substring(12).trim();
148                     val = val.split("[^\\d\\.]+")[0];
149                     try {
150                         current.setCrawlDelay(Float.parseFloat(val));
151                     } catch (NumberFormatException nfe) {
152                         // ignore
153                     }
154                     continue;
155                 }
156                 if (read.matches("(?i)Allow:.*")) {
157                     if (current == null) {
158                         // buggy robots.txt
159                         hasErrors = true;
160                         continue;
161                     }
162                     String path = read.substring(6).trim();
163                     // tolerate common error of ending path with '*' character
164                     // (not allowed by original spec; redundant but harmless with 
165                     // Google's wildcarding extensions -- which we don't yet fully
166                     // support). 
167                     if(path.endsWith("*")) {
168                         path = path.substring(0,path.length()-1); 
169                     }
170                     current.addAllow(path);
171                     hasDirectivesYet = true;
172                     continue;
173                 }
174                 // unknown line; do nothing for now
175             }
176         }
177     }
178 
179     /**
180      * Does this policy effectively allow everything? (No 
181      * disallows or timing (crawl-delay) directives?)
182      * @return <b>true</b> if the map containing user-agents to directives is empty, <b>false</b> otherwise.
183      */
184     public boolean allowsAll() {
185         // TODO: refine so directives that are all empty are also 
186         // recognized as allowing all
187         return agentsToDirectives.isEmpty();
188     }
189     
190     public List<String> getNamedUserAgents() {
191         return namedUserAgents;
192     }
193 
194     /**
195      * Return the RobotsDirectives, if any, appropriate for the given User-Agent
196      * string. If useFallbacks is true, a wildcard ('*') directives or the default
197      * of NO_DIRECTIVES will be returned, as appropriate, if there is no better
198      * match. If useFallbacks is false, a null will be returned if no declared
199      * directives targeted the given User-Agent.
200      * 
201      * @param ua String User-Agent to lookup
202      * @param useFallbacks if true, fall-back to wildcard directives or 
203      * default allow as needed
204      * @return directives to use, or null if useFallbacks is false and no 
205      * non-wildcard directives match the supplied User-Agent
206      */
207     public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) {
208         // find matching ua
209         for(String uaListed : namedUserAgents) {
210             if(ua.indexOf(uaListed)>-1) {
211                 return agentsToDirectives.get(uaListed);
212             }
213         }
214         if(useFallbacks==false) {
215             return null; 
216         }
217         if (wildcardDirectives!=null) {
218             return wildcardDirectives;
219         }
220         // no applicable user-agents, so empty directives
221         return NO_DIRECTIVES; 
222     }
223 
224     /**
225      * Return directives to use for the given User-Agent, resorting to wildcard
226      * rules or the default no-directives if necessary.
227      * 
228      * @param userAgent String User-Agent to lookup
229      * @return directives to use
230      */
231     public RobotsDirectives getDirectivesFor(String userAgent) {
232         return getDirectivesFor(userAgent, true);
233     }
234 }