// "View Javadoc" — navigation link text left over from the source-browser export of this file
1   /*
2    *  This file is part of the Wayback archival access software
3    *   (http://archive-access.sourceforge.net/projects/wayback/).
4    *
5    *  Licensed to the Internet Archive (IA) by one or more individual 
6    *  contributors. 
7    *
8    *  The IA licenses this file to You under the Apache License, Version 2.0
9    *  (the "License"); you may not use this file except in compliance with
10   *  the License.  You may obtain a copy of the License at
11   *
12   *      http://www.apache.org/licenses/LICENSE-2.0
13   *
14   *  Unless required by applicable law or agreed to in writing, software
15   *  distributed under the License is distributed on an "AS IS" BASIS,
16   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   *  See the License for the specific language governing permissions and
18   *  limitations under the License.
19   */
20  package org.archive.wayback.accesscontrol.robotstxt;
21  
22  import java.io.BufferedReader;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.InputStreamReader;
26  import java.util.ArrayList;
27  import java.util.HashMap;
28  import java.util.Iterator;
29  import java.util.LinkedList;
30  import java.util.List;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  
36  import org.archive.wayback.util.ByteOp;
37  
/**
 * Class which parses a robots.txt file, storing the Disallow rules contained
 * therein, and then allows for testing whether path/userAgent tuples are
 * blocked by those rules.
 *
 * Not thread-safe: callers are expected to {@link #parse(InputStream)} once
 * before querying.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class RobotRules {

	private static final Logger LOGGER = Logger.getLogger(RobotRules.class
			.getName());

	/**
	 * Special name for User-agent which matches all values
	 */
	public static final String GLOBAL_USER_AGENT = "*";

	// Patterns are compiled once; matches() requires a full-line match, so the
	// (.*) group captures everything after the colon.
	protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent\\s*:(.*)");
	protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow\\s*:(.*)");
	protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow\\s*:(.*)");

	// set when a Disallow rule appears before any User-agent line
	private boolean bSyntaxErrors = false;
	// user-agent (lower-cased) -> disallowed path-prefixes for that agent
	private HashMap<String, ArrayList<String>> rules =
		new HashMap<String, ArrayList<String>>();
	// every user-agent seen, lower-cased, in document order
	private LinkedList<String> userAgents = new LinkedList<String>();

	/**
	 * @return true if the robots.txt file looked suspicious, currently meaning
	 * we found a Disallow rule that was not preceded by a "User-agent:" line
	 */
	public boolean hasSyntaxErrors() {
		return bSyntaxErrors;
	}

	/**
	 * @return a List of all UserAgents Found in the Robots.txt document,
	 * lower-cased, in the order they appeared
	 */
	public List<String> getUserAgentsFound() {
		return userAgents;
	}

	/**
	 * Read rules from InputStream argument into this RobotRules, as a
	 * side-effect, sets the bSyntaxErrors property. The stream is always
	 * closed before returning, even if reading fails.
	 *
	 * @param is InputStream containing the robots.txt document, assumed UTF-8
	 * @throws IOException for usual reasons
	 */
	public void parse(InputStream is) throws IOException {

		BufferedReader br = new BufferedReader(new InputStreamReader(is,
				"UTF-8"));
		try {
			// true if an Allow rule was seen since the last User-agent line
			boolean allowRuleFound = false;
			// true if the previous meaningful line was a User-agent line
			boolean prevLineWasUA = false;
			// rules list of the most recent User-agent line(s), null until
			// the first one is seen
			ArrayList<String> current = null;
			String line;
			while ((line = readRuleLine(br)) != null) {
				boolean thisLineIsUA = false;
				Matcher uaMatcher = USER_AGENT_PATTERN.matcher(line);
				Matcher disallowMatcher = DISALLOW_PATTERN.matcher(line);
				Matcher allowMatcher = ALLOW_PATTERN.matcher(line);
				if (uaMatcher.matches()) {
					String ua = uaMatcher.group(1).trim().toLowerCase();
					// Consecutive User-agent lines (no rules and no Allow in
					// between) share a single rules list; anything else
					// starts a fresh list.
					if (current == null || current.size() != 0 ||
							allowRuleFound || !prevLineWasUA) {
						current = new ArrayList<String>();
					}
					rules.put(ua, current);
					// BUGFIX: record the agent so getUserAgentsFound() works
					userAgents.add(ua);
					allowRuleFound = false;
					thisLineIsUA = true;
					LOGGER.fine("Found User-agent(" + ua + ") rules...");
				} else if (disallowMatcher.matches()) {
					if (current == null) {
						// Disallow before any User-agent: buggy robots.txt
						bSyntaxErrors = true;
					} else {
						String path = disallowMatcher.group(1).trim();
						current.add(path);
					}
				} else if (allowMatcher.matches()) {
					// Allow rules are not applied, but their presence forces
					// the next User-agent line to start a fresh rules list.
					allowRuleFound = true;
				}
				// any other line is silently ignored for now
				// TODO: flag a syntax error if we encounter unknown lines?
				prevLineWasUA = thisLineIsUA;
			}
		} finally {
			br.close();
		}
	}

	/**
	 * Read the next meaningful line from the reader: skips blank lines and
	 * whole-line comments, strips trailing "#" comments, and trims
	 * surrounding whitespace.
	 *
	 * @param br reader positioned within the robots.txt document
	 * @return the cleaned, non-empty line, or null at end of stream
	 * @throws IOException for usual reasons
	 */
	private static String readRuleLine(BufferedReader br) throws IOException {
		String read;
		do {
			read = br.readLine();
			// Skip comments & blanks
		} while ((read != null) && ((read = read.trim()).startsWith("#") ||
			read.length() == 0));
		if (read == null) {
			return null;
		}
		int commentIndex = read.indexOf("#");
		if (commentIndex > -1) {
			// Strip trailing comment
			read = read.substring(0, commentIndex);
		}
		return read.trim();
	}

	/**
	 * Test the path against one user-agent's rules list.
	 *
	 * An empty Disallow entry means "everything allowed" for that agent and
	 * short-circuits any later rules (historical behavior, preserved).
	 *
	 * @param path server-relative path to test
	 * @param curUA user-agent name, used only for logging
	 * @param uaRules disallowed path-prefixes for that user-agent
	 * @return true if some rule blocks the path
	 */
	private boolean blocksPath(String path, String curUA, List<String> uaRules) {

		for (String disallowedPath : uaRules) {
			if (disallowedPath.length() == 0) {

				if (LOGGER.isLoggable(Level.INFO)) {
					LOGGER.info("UA(" + curUA
							+ ") has empty disallow: Go for it!");
				}
				return false;

			}
			if (LOGGER.isLoggable(Level.FINE)) {
				LOGGER.fine("UA(" + curUA + ") has (" + disallowedPath
						+ ") blocked...(" + disallowedPath.length() + ")");
			}
			if (disallowedPath.equals("/") || path.startsWith(disallowedPath)) {
				if (LOGGER.isLoggable(Level.INFO)) {
					LOGGER.info("Rule(" + disallowedPath + ") applies to (" +
							path + ")");
				}
				return true;
			}
		}
		return false;
	}

	/**
	 * Checks first the specified ua UserAgent, if rules are present for it,
	 * and then falls back to using rules for the '*' UserAgent.
	 *
	 * @param path String server relative path to check for access
	 * @param ua String user agent to check for access
	 * @return boolean value where true indicates the path is blocked for ua
	 */
	public boolean blocksPathForUA(String path, String ua) {

		// single lookup instead of containsKey + get; stored values are
		// never null, so a null result means "no rules for this agent"
		ArrayList<String> uaRules = rules.get(ua.toLowerCase());
		if (uaRules != null) {
			return blocksPath(path, ua, uaRules);
		}
		ArrayList<String> globalRules = rules.get(GLOBAL_USER_AGENT);
		if (globalRules != null) {
			return blocksPath(path, GLOBAL_USER_AGENT, globalRules);
		}
		return false;
	}
}
205 }