View Javadoc
1   /*
2    *  This file is part of the Heritrix web crawler (crawler.archive.org).
3    *
4    *  Licensed to the Internet Archive (IA) by one or more individual 
5    *  contributors. 
6    *
7    *  The IA licenses this file to You under the Apache License, Version 2.0
8    *  (the "License"); you may not use this file except in compliance with
9    *  the License.  You may obtain a copy of the License at
10   *
11   *      http://www.apache.org/licenses/LICENSE-2.0
12   *
13   *  Unless required by applicable law or agreed to in writing, software
14   *  distributed under the License is distributed on an "AS IS" BASIS,
15   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   *  See the License for the specific language governing permissions and
17   *  limitations under the License.
18   */
19  package org.archive.wayback.accesscontrol.robotstxt;
20  
21  import java.io.Serializable;
22  import java.util.concurrent.ConcurrentSkipListSet;
23  
/**
 * Represents the directives that apply to a user-agent (or set of
 * user-agents).
 *
 * <p>Allow and disallow rules are stored as plain path prefixes in sorted
 * concurrent sets. A path is permitted unless the longest disallow prefix
 * matching it is strictly longer than the longest matching allow prefix.
 */
public class RobotsDirectives implements Serializable {
    private static final long serialVersionUID = 5386542759286155383L;

    /** Path prefixes named by disallow rules; never contains the empty string. */
    ConcurrentSkipListSet<String> disallows = new ConcurrentSkipListSet<String>();
    /** Path prefixes named by allow rules. */
    ConcurrentSkipListSet<String> allows = new ConcurrentSkipListSet<String>();
    /** Crawl delay; stays at -1 until {@link #setCrawlDelay(float)} is called. */
    float crawlDelay = -1;

    /**
     * Tests whether the given path is permitted by these directives.
     *
     * @param path path to test
     * @return {@code false} only if the longest disallow prefix matching
     *         {@code path} is strictly longer than the longest matching allow
     *         prefix; {@code true} otherwise (ties favor allowing)
     */
    public boolean allows(String path) {
        return longestPrefixLength(disallows, path) <= longestPrefixLength(allows, path);
    }

    /**
     * @param prefixSet
     * @param str
     * @return length of longest entry in {@code prefixSet} that prefixes {@code str}, or zero
     *         if no entry prefixes {@code str}
     */
    protected int longestPrefixLength(ConcurrentSkipListSet<String> prefixSet,
            String str) {
        // The greatest entry <= str is not necessarily a prefix of str: with
        // entries "/a" and "/a/b", the floor of "/a/c" is "/a/b". Any entry
        // that does prefix str and sorts below such a candidate must also
        // prefix the text the candidate shares with str, so keep probing with
        // that shared text. The probe gets strictly shorter on every round,
        // which bounds the loop by str.length() iterations.
        String candidate = prefixSet.floor(str);
        while (candidate != null) {
            if (str.startsWith(candidate)) {
                return candidate.length();
            }
            candidate = prefixSet.floor(commonPrefix(candidate, str));
        }
        return 0;
    }

    /**
     * Returns the longest leading run of characters shared by both strings.
     */
    private static String commonPrefix(String a, String b) {
        int limit = Math.min(a.length(), b.length());
        int i = 0;
        while (i < limit && a.charAt(i) == b.charAt(i)) {
            i++;
        }
        return a.substring(0, i);
    }

    /**
     * Records a disallow rule for the given path prefix.
     *
     * @param path path prefix to disallow; an empty string is ignored
     */
    public void addDisallow(String path) {
        if(path.length()==0) {
            // ignore empty-string disallows 
            // (they really mean allow, when alone)
            return;
        }
        disallows.add(path);
    }

    /**
     * Records an allow rule for the given path prefix.
     *
     * @param path path prefix to allow
     */
    public void addAllow(String path) {
        allows.add(path);
    }

    /**
     * @param i crawl delay value to store
     */
    public void setCrawlDelay(float i) {
        crawlDelay=i;
    }

    /**
     * @return the stored crawl delay, or -1 if none has been set
     */
    public float getCrawlDelay() {
        return crawlDelay;
    }
}