1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.archive.wayback.accesscontrol.robotstxt;
20
21 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.Serializable;
24 import java.util.HashMap;
25 import java.util.LinkedList;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.apache.commons.io.IOUtils;
32 import org.archive.io.ReadSource;
33
34
35
36
37
38
39 public class Robotstxt implements Serializable {
40 static final long serialVersionUID = 7025386509301303890L;
41 private static final Logger logger =
42 Logger.getLogger(Robotstxt.class.getName());
43
44
45
46
47 LinkedList<String> namedUserAgents = new LinkedList<String>();
48
49 Map<String,RobotsDirectives> agentsToDirectives =
50 new HashMap<String,RobotsDirectives>();
51 RobotsDirectives wildcardDirectives = null;
52
53 boolean hasErrors = false;
54
55 static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
56
57 public static Robotstxt NO_ROBOTS = new Robotstxt();
58
59 public Robotstxt() {
60 }
61
62 public Robotstxt(BufferedReader reader) throws IOException {
63 initializeFromReader(reader);
64 }
65
66 public Robotstxt(ReadSource customRobots) {
67 BufferedReader reader = new BufferedReader(customRobots.obtainReader());
68 try {
69 initializeFromReader(reader);
70 } catch (IOException e) {
71 logger.log(Level.SEVERE,
72 "robots ReadSource problem: potential for inadvertent overcrawling",
73 e);
74 } finally {
75 IOUtils.closeQuietly(reader);
76 }
77 }
78
79 protected void initializeFromReader(BufferedReader reader) throws IOException {
80 String read;
81
82 RobotsDirectives current = null;
83
84 boolean hasDirectivesYet = false;
85 while (reader != null) {
86 do {
87 read = reader.readLine();
88
89 } while ((read != null) && ((read = read.trim()).startsWith("#") ||
90 read.length() == 0));
91 if (read == null) {
92 reader.close();
93 reader = null;
94 } else {
95
96 read = read.replaceAll("<[^>]+>","");
97 int commentIndex = read.indexOf("#");
98 if (commentIndex > -1) {
99
100 read = read.substring(0, commentIndex);
101 }
102 read = read.trim();
103 if (read.matches("(?i)^User-agent:.*")) {
104 String ua = read.substring(11).trim().toLowerCase();
105 if (current == null || hasDirectivesYet ) {
106
107
108 current = new RobotsDirectives();
109 hasDirectivesYet = false;
110 }
111 if (ua.equals("*")) {
112 wildcardDirectives = current;
113 } else {
114 namedUserAgents.addLast(ua);
115 agentsToDirectives.put(ua, current);
116 }
117 continue;
118 }
119 if (read.matches("(?i)Disallow:.*")) {
120 if (current == null) {
121
122 hasErrors = true;
123 continue;
124 }
125 String path = read.substring(9).trim();
126
127
128
129
130 if(path.endsWith("*")) {
131 path = path.substring(0,path.length()-1);
132 }
133 current.addDisallow(path);
134 hasDirectivesYet = true;
135 continue;
136 }
137 if (read.matches("(?i)Crawl-delay:.*")) {
138 if (current == null) {
139
140 hasErrors = true;
141 continue;
142 }
143
144
145
146 hasDirectivesYet = true;
147 String val = read.substring(12).trim();
148 val = val.split("[^\\d\\.]+")[0];
149 try {
150 current.setCrawlDelay(Float.parseFloat(val));
151 } catch (NumberFormatException nfe) {
152
153 }
154 continue;
155 }
156 if (read.matches("(?i)Allow:.*")) {
157 if (current == null) {
158
159 hasErrors = true;
160 continue;
161 }
162 String path = read.substring(6).trim();
163
164
165
166
167 if(path.endsWith("*")) {
168 path = path.substring(0,path.length()-1);
169 }
170 current.addAllow(path);
171 hasDirectivesYet = true;
172 continue;
173 }
174
175 }
176 }
177 }
178
179
180
181
182
183
184 public boolean allowsAll() {
185
186
187 return agentsToDirectives.isEmpty();
188 }
189
190 public List<String> getNamedUserAgents() {
191 return namedUserAgents;
192 }
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207 public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) {
208
209 for(String uaListed : namedUserAgents) {
210 if(ua.indexOf(uaListed)>-1) {
211 return agentsToDirectives.get(uaListed);
212 }
213 }
214 if(useFallbacks==false) {
215 return null;
216 }
217 if (wildcardDirectives!=null) {
218 return wildcardDirectives;
219 }
220
221 return NO_DIRECTIVES;
222 }
223
224
225
226
227
228
229
230
231 public RobotsDirectives getDirectivesFor(String userAgent) {
232 return getDirectivesFor(userAgent, true);
233 }
234 }