HXT/Practical/WebSpider

From HaskellWiki
< HXT‎ | Practical
Revision as of 17:57, 24 January 2011 by UweSchmidt (talk | contribs) (updated for HXT-9)

Jump to: navigation, search

module Main where import Data.List import Data.Maybe import qualified Data.Set as Set import System import Network.HTTP import Network.URI import Text.XML.HXT.Core import Text.XML.HXT.Curl type MyArrow b c = IOStateArrow (Set.Set String) b c main = do [url] <- getArgs lines <- runUrl url mapM_ putStrLn lines split = arr (\x -> (x,x)) {- runUrl takes a seed URL and starts spidering from there, returning a - list of validation or other errors. -} runUrl :: String -> IO [String] runUrl url = runX (constA url >>> setTraceLevel 0 >>> withOtherUserState Set.empty (split >>> checkUrl >. unlines >>> perform (getUserState >>> Set.size ^>> (trace 0 $ arr (\x -> "Checked " ++ show x ++ " urls"))) ) ) {- checkUrl is an arrow taking as input a pair (url, base), and producing - a list of errors encountered. The url could be relative to the base. - This arrow will recursively check additional urls encountered in local - URLs. Remote URLs are only checked for status. -} checkUrl :: MyArrow (String,String) String checkUrl = clearErrStatus >>> first normalizeUrl >>> ifA (first seenUrl) (fst ^>> traceString 1 ("Skipping (seen already) " ++) >>> none) (first markSeen >>> ifP isLocalHtml validateAndSpiderUrl checkUrlStatus) {- readFromDocument uses the external curl program because of a file - descriptor leak in either Network.HTTP or HXT -} validateAndSpiderUrl :: MyArrow (String, String) String validateAndSpiderUrl = arr (\(x,y) -> (x,x)) >>> first ( traceString 0 ("Validating " ++) >>> readFromDocument [ withCurl []] >>> selectLinks >>> traceString 1 ("Found link: " ++) ) -- >>> arr (\(x,y) -> ((x,y),y)) >>> first expandURI >>> (this &&& arr snd) >>> first expandURI >>> checkUrl seenUrl :: MyArrow String String seenUrl = split >>> second getUserState >>> (uncurry Set.member) `guardsP` (arr fst) markSeen :: MyArrow String String markSeen = changeUserState Set.insert normalizeUrl = arrL (maybeToList . removeFragment) where removeFragment u = do uri <- parseURIReference u return $ show uri { uriFragment = ""} selectLinks :: ArrowXml a => a XmlTree String selectLinks = deep (isElem >>> hasName "a" >>> getAttrValue "href" >>> mkText) >>> getText {- Note that we already expanded any relative URLs -} isLocalHtml :: (String, String) -> Bool isLocalHtml (url, base) = haveSameHost url base && isHtmlUrl url where haveSameHost a b = fromMaybe False (do urlA <- parseURI a urlB <- parseURI b authA <- uriAuthority urlA authB <- uriAuthority urlB return $ uriRegName authA == uriRegName authB ) isHtmlUrl url = isSuffixOf ".html" url || isSuffixOf "/" url {- Checks the status of a url and returns an error message if anything - other than a 200 OK response results. -} checkUrlStatus :: MyArrow (String,String) String checkUrlStatus = first (traceString 0 ("Checking status of " ++) >>> arrIO (responseCode) >>> arrL (maybeToList)) >>> arr (\(u,b) -> u ++ " (linked from " ++ b ++ ")") responseCode :: String -> IO (Maybe String) responseCode url = case parseURIReference url of Nothing -> return $ Just ("Bad URL: " ++ url) Just uri -> if uriScheme uri == "http:" then catch (responseCode' uri) (\e -> return $ Just (show uri ++ ": " ++ show e)) else return $ Nothing where responseCode' uri = -- HEAD would be sufficient except that some sites -- (like Amazon) disallow it :-( do result <- simpleHTTP $ Request uri GET [] "" return $ either (Just . ((show uri ++ ": ") ++) . show) (responseMessage uri) result responseMessage uri response = case classifyResponse response of Success -> Nothing Error reason -> Just (show uri ++ ": " ++ reason) Moved loc -> Just (show uri ++ ": moved to " ++ (fromMaybe "unknown location" loc)) -- Network.HTTP v3000.0.0 forgot to expose this type type ResponseCode = (Int, Int, Int) data HttpResponseType = Success | Moved (Maybe String) -- new location | Error String -- reason classifyResponse :: Response -> HttpResponseType classifyResponse response = case rspCode response of (2,0,0) -> Success (3,0,2) -> Success -- Found (3,0,7) -> Success -- Temporary Redirect (3,0,1) -> Moved $ findHeader HdrLocation response _ -> Error $ rspReason response