// Robotstxt

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.accesscontrol.robotstxt;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;

/**
 * Utility class for parsing and representing 'robots.txt' format
 * directives, into a list of named user-agents and map from user-agents
 * to RobotsDirectives.
 *
 * <p>User-agent tokens are stored lower-cased. The wildcard ('*') group is
 * kept separately from the named groups, in {@link #wildcardDirectives}.
 */
public class Robotstxt implements Serializable {
    static final long serialVersionUID = 7025386509301303890L;
    private static final Logger logger =
        Logger.getLogger(Robotstxt.class.getName());

    // Line-classification patterns, compiled once rather than on every
    // line read. Each is applied with Matcher.matches(), i.e. against the
    // whole (already trimmed) line.
    private static final Pattern USER_AGENT_LINE =
        Pattern.compile("(?i)^User-agent:.*");
    private static final Pattern DISALLOW_LINE =
        Pattern.compile("(?i)Disallow:.*");
    private static final Pattern CRAWL_DELAY_LINE =
        Pattern.compile("(?i)Crawl-delay:.*");
    private static final Pattern ALLOW_LINE =
        Pattern.compile("(?i)Allow:.*");
    // any <...> run, removed from lines before they are interpreted
    private static final Pattern HTML_MARKUP = Pattern.compile("<[^>]+>");
    // separator used to isolate the leading numeric part of a crawl-delay
    private static final Pattern NON_NUMERIC = Pattern.compile("[^\\d\\.]+");

    // all user agents contained in this robots.txt
    // in order of declaration
    // TODO: consider discarding irrelevant entries
    LinkedList<String> namedUserAgents = new LinkedList<String>();
    // map user-agents to directives
    Map<String,RobotsDirectives> agentsToDirectives =
        new HashMap<String,RobotsDirectives>();
    // directives declared for 'User-agent: *', or null if there were none
    RobotsDirectives wildcardDirectives = null;

    // set when an Allow/Disallow/Crawl-delay line appears before any
    // User-agent line
    boolean hasErrors = false;

    static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
    /** empty, reusable instance for all sites providing no rules */
    public static Robotstxt NO_ROBOTS = new Robotstxt();

    /**
     * Create an empty instance: no named user-agents and no wildcard rules.
     */
    public Robotstxt() {
    }

    /**
     * Create an instance from robots.txt content supplied by a reader.
     *
     * @param reader source of robots.txt lines; closed once fully read
     * @throws IOException if reading (or closing) the reader fails
     */
    public Robotstxt(BufferedReader reader) throws IOException {
        initializeFromReader(reader);
    }

    /**
     * Create an instance from a ReadSource. An IOException while reading is
     * logged at SEVERE level rather than propagated, leaving this instance
     * with whatever was parsed before the failure.
     *
     * @param customRobots source providing the robots.txt content
     */
    public Robotstxt(ReadSource customRobots) {
        BufferedReader reader = new BufferedReader(customRobots.obtainReader());
        try {
            initializeFromReader(reader);
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                    "robots ReadSource problem: potential for inadvertent overcrawling",
                    e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Parse robots.txt content line by line, populating the named
     * user-agent list, the user-agent to directives map and the wildcard
     * directives. Consecutive User-agent lines share one RobotsDirectives
     * until a rule line (Disallow, Allow or Crawl-delay) is seen. Rule lines
     * appearing before any User-agent line are skipped and flag
     * {@link #hasErrors}. Unrecognized lines are ignored.
     *
     * @param reader source of lines; a null reader is a no-op. The reader is
     * closed after the last line has been read.
     * @throws IOException if reading (or closing) the reader fails
     */
    protected void initializeFromReader(BufferedReader reader) throws IOException {
        if (reader == null) {
            return;
        }
        // current is the directives for the preceding User-Agent(s)
        RobotsDirectives current = null;
        // whether a non-'User-Agent' directive has been encountered
        boolean hasDirectivesYet = false;
        String read;
        while ((read = reader.readLine()) != null) {
            read = read.trim();
            // Skip comments & blanks
            if (read.length() == 0 || read.startsWith("#")) {
                continue;
            }
            // remove any html markup
            read = HTML_MARKUP.matcher(read).replaceAll("");
            int commentIndex = read.indexOf("#");
            if (commentIndex > -1) {
                // Strip trailing comment
                read = read.substring(0, commentIndex);
            }
            read = read.trim();

            if (USER_AGENT_LINE.matcher(read).matches()) {
                // 11 == "User-agent:".length(); Locale.ROOT keeps the
                // stored token independent of the JVM default locale
                String ua = read.substring(11).trim().toLowerCase(Locale.ROOT);
                if (current == null || hasDirectivesYet) {
                    // only create new rules-list if necessary
                    // otherwise share with previous user-agent
                    current = new RobotsDirectives();
                    hasDirectivesYet = false;
                }
                if (ua.equals("*")) {
                    wildcardDirectives = current;
                } else {
                    namedUserAgents.addLast(ua);
                    agentsToDirectives.put(ua, current);
                }
                continue;
            }

            boolean isDisallow = DISALLOW_LINE.matcher(read).matches();
            boolean isCrawlDelay =
                !isDisallow && CRAWL_DELAY_LINE.matcher(read).matches();
            boolean isAllow = !isDisallow && !isCrawlDelay
                && ALLOW_LINE.matcher(read).matches();
            if (!(isDisallow || isCrawlDelay || isAllow)) {
                // unknown line; do nothing for now
                continue;
            }
            if (current == null) {
                // buggy robots.txt: rule with no preceding User-agent
                hasErrors = true;
                continue;
            }
            // any rule line is sufficient to end a grouping of
            // User-Agent lines
            hasDirectivesYet = true;

            if (isDisallow) {
                // 9 == "Disallow:".length()
                current.addDisallow(stripTrailingWildcard(read.substring(9).trim()));
            } else if (isAllow) {
                // 6 == "Allow:".length()
                current.addAllow(stripTrailingWildcard(read.substring(6).trim()));
            } else {
                // 12 == "Crawl-delay:".length()
                String val = read.substring(12).trim();
                // split() returns an EMPTY array when the value holds no
                // digits or dots at all (e.g. "Crawl-delay: abc"), so the
                // length must be checked before taking the first element
                String[] numericParts = NON_NUMERIC.split(val);
                if (numericParts.length > 0) {
                    try {
                        current.setCrawlDelay(Float.parseFloat(numericParts[0]));
                    } catch (NumberFormatException nfe) {
                        // unparseable delay (e.g. "" or "1.2.3"): ignore
                    }
                }
            }
        }
        reader.close();
    }

    /**
     * Tolerate the common error of ending a path with a '*' character
     * (not allowed by original spec; redundant but harmless with
     * Google's wildcarding extensions -- which we don't yet fully
     * support) by dropping one trailing '*'.
     *
     * @param path rule path as written in the robots.txt
     * @return path without its final '*', or path unchanged if it has none
     */
    private static String stripTrailingWildcard(String path) {
        if (path.endsWith("*")) {
            return path.substring(0, path.length() - 1);
        }
        return path;
    }

    /**
     * Does this policy effectively allow everything? (No
     * disallows or timing (crawl-delay) directives?)
     * @return <b>true</b> if there are neither named user-agent directives
     * nor wildcard ('*') directives, <b>false</b> otherwise.
     */
    public boolean allowsAll() {
        // TODO: refine so directives that are all empty are also
        // recognized as allowing all
        // wildcard rules live outside agentsToDirectives, so they must be
        // checked separately
        return agentsToDirectives.isEmpty() && wildcardDirectives == null;
    }

    /**
     * @return the live list of named (non-wildcard) user-agent tokens, in
     * order of declaration
     */
    public List<String> getNamedUserAgents() {
        return namedUserAgents;
    }

    /**
     * Return the RobotsDirectives, if any, appropriate for the given User-Agent
     * string. A named group matches when its (lower-cased) token occurs
     * anywhere in the supplied User-Agent, compared case-insensitively; the
     * first match in declaration order wins. If useFallbacks is true, a
     * wildcard ('*') directives or the default of NO_DIRECTIVES will be
     * returned, as appropriate, if there is no better match. If useFallbacks
     * is false, a null will be returned if no declared directives targeted
     * the given User-Agent.
     *
     * @param ua String User-Agent to lookup; null matches no named group
     * @param useFallbacks if true, fall-back to wildcard directives or
     * default allow as needed
     * @return directives to use, or null if useFallbacks is false and no
     * non-wildcard directives match the supplied User-Agent
     */
    public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) {
        if (ua != null) {
            // listed tokens were lower-cased when parsed, so lower-case the
            // candidate too to make the substring match case-insensitive
            String candidate = ua.toLowerCase(Locale.ROOT);
            // NOTE(review): an empty listed token (a bare "User-agent:" line)
            // is a substring of every candidate and so matches here
            for (String uaListed : namedUserAgents) {
                if (candidate.indexOf(uaListed) > -1) {
                    return agentsToDirectives.get(uaListed);
                }
            }
        }
        if (!useFallbacks) {
            return null;
        }
        if (wildcardDirectives != null) {
            return wildcardDirectives;
        }
        // no applicable user-agents, so empty directives
        return NO_DIRECTIVES;
    }

    /**
     * Return directives to use for the given User-Agent, resorting to wildcard
     * rules or the default no-directives if necessary.
     *
     * @param userAgent String User-Agent to lookup
     * @return directives to use
     */
    public RobotsDirectives getDirectivesFor(String userAgent) {
        return getDirectivesFor(userAgent, true);
    }
}