1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.archive.wayback.accesscontrol.robotstxt;
21
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.util.ByteOp;
37
38
39
40
41
42
43
44
45
46 public class RobotRules {
47
48 private static final long serialVersionUID = 2917420727021840982L;
49 private static final Logger LOGGER = Logger.getLogger(RobotRules.class
50 .getName());
51
52
53
54 public static final String GLOBAL_USER_AGENT = "*";
55
56 protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent\\s*:(.*)");
57 protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow\\s*:(.*)");
58 protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow\\s*:(.*)");
59
60 private boolean bSyntaxErrors = false;
61 private HashMap<String, ArrayList<String>> rules =
62 new HashMap<String, ArrayList<String>>();
63
64 private LinkedList<String> userAgents = new LinkedList<String>();
65
66
67
68
69
70 public boolean hasSyntaxErrors() {
71 return bSyntaxErrors;
72 }
73
74
75
76
77 public List<String> getUserAgentsFound() {
78 return userAgents;
79 }
80
81
82
83
84
85
86
87
88 public void parse(InputStream is) throws IOException {
89
90 BufferedReader br = new BufferedReader(new InputStreamReader(
91 (InputStream) is,ByteOp.UTF8));
92 String read;
93 boolean allowRuleFound = false;
94
95 boolean currLineUA = false;
96 boolean lastLineUA = false;
97 ArrayList<String> current = null;
98 while (br != null) {
99 lastLineUA = currLineUA;
100 do {
101 read = br.readLine();
102
103 } while ((read != null) && ((read = read.trim()).startsWith("#") ||
104 read.length() == 0));
105 if (read == null) {
106 br.close();
107 br = null;
108 } else {
109 currLineUA = false;
110 int commentIndex = read.indexOf("#");
111 if (commentIndex > -1) {
112
113 read = read.substring(0, commentIndex);
114 }
115 read = read.trim();
116 Matcher uaMatcher = USER_AGENT_PATTERN.matcher(read);
117 if (uaMatcher.matches()) {
118 String ua = uaMatcher.group(1).trim().toLowerCase();
119 if (current == null || current.size() != 0 || allowRuleFound || !lastLineUA) {
120
121
122 current = new ArrayList<String>();
123 }
124 rules.put(ua, current);
125 allowRuleFound = false;
126 currLineUA = true;
127 LOGGER.fine("Found User-agent(" + ua + ") rules...");
128 continue;
129 }
130 Matcher disallowMatcher = DISALLOW_PATTERN.matcher(read);
131 if (disallowMatcher.matches()) {
132 if (current == null) {
133
134 bSyntaxErrors = true;
135 continue;
136 }
137 String path = disallowMatcher.group(1).trim();
138 current.add(path);
139 continue;
140 }
141 Matcher allowMatcher = ALLOW_PATTERN.matcher(read);
142 if (allowMatcher.matches()) {
143
144 allowRuleFound = true;
145 }
146
147
148
149
150 }
151 }
152 }
153
154 private boolean blocksPath(String path, String curUA, List<String> uaRules) {
155
156 Iterator<String> disItr = uaRules.iterator();
157 while (disItr.hasNext()) {
158 String disallowedPath = disItr.next();
159 if (disallowedPath.length() == 0) {
160
161 if (LOGGER.isLoggable(Level.INFO)) {
162 LOGGER.info("UA(" + curUA
163 + ") has empty disallow: Go for it!");
164 }
165 return false;
166
167 } else {
168 if (LOGGER.isLoggable(Level.FINE)) {
169 LOGGER.fine("UA(" + curUA + ") has (" + disallowedPath
170 + ") blocked...(" + disallowedPath.length() + ")");
171 }
172 if (disallowedPath.equals("/") || path.startsWith(disallowedPath)) {
173 if (LOGGER.isLoggable(Level.INFO)) {
174 LOGGER.info("Rule(" + disallowedPath + ") applies to (" +
175 path + ")");
176 }
177 return true;
178 }
179 }
180 }
181 return false;
182 }
183
184
185
186
187
188
189
190
191
192 public boolean blocksPathForUA(String path, String ua) {
193
194 if(rules.containsKey(ua.toLowerCase())) {
195
196 return blocksPath(path,ua,rules.get(ua.toLowerCase()));
197
198 } else if(rules.containsKey(GLOBAL_USER_AGENT)) {
199
200 return blocksPath(path,GLOBAL_USER_AGENT,
201 rules.get(GLOBAL_USER_AGENT));
202 }
203 return false;
204 }
205 }