HXT/Practical/Google1: Difference between revisions

From HaskellWiki
< HXT‎ | Practical
(dealing with HTML)
 
(Use block markup for multiline code)
 
(3 intermediate revisions by 3 users not shown)
Line 1: Line 1:
<hask>
<haskell>
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import Text.XML.HXT.Arrow
import Text.XML.HXT.Core
import Text.Printf
import Text.Printf
import Network.URI
import Network.URI
Line 8: Line 8:
import Data.Char
import Data.Char
import System.Environment
import System.Environment
import Control.Monad (forM_)


-- Search Google and retrieve the results into some
-- Search Google and retrieve the results into some
Line 28: Line 29:
-- so I've poked around and came up with this:
-- so I've poked around and came up with this:
-- look for  
-- look for  
--  <div id="res"> ... <h2> ... <a onmousedown="..." ...>
--  <h3> ... <a href="..." ...>
-- and the results will be contained within the anchor.
-- and the results will be contained within the anchor.


selectGoogleResults =  
selectGoogleResults =  
   atTagCase "div" >>> hasAttrValue "id" (=="res")
   atTagCase "h3"
  >>> atTagCase "h2"
   >>> atTagCase "a"
   >>> atTagCase "a" >>> hasAttrValue "onmousedown" (not . null)
   >>> (getAttrValue "href" &&&  
   >>> (getAttrValue "href" &&&  
       -- The title may be broken up into multiple text nodes.
       -- The title may be broken up into multiple text nodes.
Line 45: Line 45:
selectGoogleResultsToXML =  
selectGoogleResultsToXML =  
   selem "results"  
   selem "results"  
     [ atTagCase "div" >>> hasAttrValue "id" (=="res")
     [ atTagCase "h3"
      >>> atTagCase "h2"
       >>> atTagCase "a"
       >>> atTagCase "a" >>> hasAttrValue "onmousedown" (not . null)
       >>> selem "result"  
       >>> selem "result"  
             [ selem "url" [getAttrValue "href" >>> mkText]
             [ selem "url" [getAttrValue "href" >>> mkText]
Line 65: Line 64:


selectGoogleResults' =  
selectGoogleResults' =  
   atTagCase "div" >>> hasAttrValue "id" (=="res")
   atTagCase "h3"
  >>> atTagCase "h2"
   >>> atTagCase "a"
   >>> atTagCase "a" >>> hasAttrValue "onmousedown" (not . null)
   >>> proc r -> do
   >>> proc r -> do
         url  <- getAttrValue "href"            -< r
         url  <- getAttrValue "href"            -< r
Line 74: Line 72:


selectGoogleResultsToXML' = proc x -> do
selectGoogleResultsToXML' = proc x -> do
   res <- listA (atTagCase "div" >>> hasAttrValue "id" (=="res")
   res <- listA (atTagCase "h3"
                >>> atTagCase "h2"
                 >>> atTagCase "a"
                 >>> atTagCase "a" >>> hasAttrValue "onmousedown" (not . null)
                 >>> selectResult) -< x
                 >>> selectResult) -< x
   selem "results" (map constA res) -<< ()
   selem "results" (map constA res) -<< ()
Line 112: Line 109:
         upper = map toUpper
         upper = map toUpper


parseHTML = readString [(a_validate,v_0),(a_parse_html,v_1),(a_issue_warnings,v_0)]
parseHTML = readString [ withValidate no
                      , withParseHTML yes
                      , withWarnings no
                      ]


-- Pretend to be a user of Mozilla Firefox, because Google
-- Pretend to be a user of Mozilla Firefox, because Google
Line 136: Line 136:
       -- tuple version
       -- tuple version
       links <- runX (parseHTML body >>> selectGoogleResults)
       links <- runX (parseHTML body >>> selectGoogleResults)
       flip mapM_ links $ \ (url,title) -> printf "%s <url:%s>\n" title url
       forM_ links $ \ (url,title) -> printf "%s <url:%s>\n" title url


       -- XML version
       -- XML version
Line 143: Line 143:
                     -- which encompasses all the top-level siblings.
                     -- which encompasses all the top-level siblings.
                     root [] [selectGoogleResultsToXML] >>>  
                     root [] [selectGoogleResultsToXML] >>>  
                     writeDocumentToString [(a_indent,v_1)])
                     writeDocumentToString [withIndent yes])
       putStrLn xml
       putStrLn xml
</hask>
</haskell>

Latest revision as of 17:08, 11 October 2011

{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import Text.XML.HXT.Core
import Text.Printf
import Network.URI
import Network.HTTP
import Data.List
import Data.Char
import System.Environment
import Control.Monad (forM_)

-- Search Google and retrieve the results into some
-- nicer data structure.  This is an exercise in
-- munging the HTML output of Google.

-- A number of possible functions are demonstrated,
-- to show some different ways of going about the task,
-- only one is really needed.

-- Personally, I favor the 2nd version, selectGoogleResultsToXML.  
-- I think this is a good demonstration of a case where the 
-- ordinary Haskell combinators are clearer than the special 
-- Arrow syntax.

--------------------------------------------------
-- First example: Select pairs of (url, title)

-- There isn't any help, from Google, in picking out the results,
-- so I've poked around and came up with this:
-- look for 
--   <h3> ... <a href="..." ...>
-- and the results will be contained within the anchor.

selectGoogleResults = 
  atTagCase "h3"
  >>> atTagCase "a"
  >>> (getAttrValue "href" &&& 
       -- The title may be broken up into multiple text nodes.
       -- So, we collect it as a list and then lift 'concat' 
       -- to combine it.
       (listA (deep isText >>> getText) >>> arr concat))

-- Or, select the data into a simpler XML document

selectGoogleResultsToXML = 
  selem "results" 
    [ atTagCase "h3"
      >>> atTagCase "a"
      >>> selem "result" 
            [ selem "url" [getAttrValue "href" >>> mkText]
            , selem "title" [deep isText] ] ] 

-- The above function shows construction of XML,
-- 
-- selem tag body = mkelem tag [] body
--   constructs an element without attributes.
--
-- Note that 'body' is a list containing XmlTree arrows,
-- that is why you can mix selection and construction.

--------------------------------------------------
-- Alternative versions, using the special Arrow 
-- syntax.

selectGoogleResults' = 
  atTagCase "h3"
  >>> atTagCase "a"
  >>> proc r -> do
        url   <- getAttrValue "href"             -< r
        title <- listA (getText <<< deep isText) -< r
        returnA -< (url, concat title)

selectGoogleResultsToXML' = proc x -> do
  res <- listA (atTagCase "h3"
                >>> atTagCase "a"
                >>> selectResult) -< x
  selem "results" (map constA res) -<< ()
  where
    selectResult = proc r -> do
      url   <- getAttrValue "href"             -< r
      title <- listA (getText <<< deep isText) -< r
      selem "result" 
        [ selem "url" [txt url]
        , selem "title" [txt (concat title)] ]  -<< ()

-- This last example introduces some new syntax,
-- namely -<<.  According to the GHC Arrow docs,
-- you cannot use locally bound variables on the
-- left-hand side of -<.  In brief, the -<< syntax is
-- a variation which permits this, while introducing
-- a dependency on the ArrowApply class.

-- The reason why 'selem' is on the LHS in the
-- first place is because, if you check the type
-- of 'selem', you will see that it is already
-- in the Arrow (so to speak).  Injecting it
-- with 'returnA' would simply create an extra
-- layer of Arrow that is not wanted.

-- Since 'selem' is not processing anything, the RHS 
-- of -<< in this case is simply ().

googleURLFormat = "http://www.google.com/search?%s"
constructGoogleURL q = printf googleURLFormat $ urlEncodeVars [("q",q)]

-- case-insensitive tag matching
atTagCase tag = deep (isElem >>> hasNameWith ((== tag') . upper . localPart))
  where tag' = upper tag
        upper = map toUpper

parseHTML = readString [ withValidate no
                       , withParseHTML yes
                       , withWarnings no
                       ]

-- Pretend to be a user of Mozilla Firefox, because Google
-- will not display results for unknown user agents.

userAgent = "Mozilla/5.0 (en-US) Firefox/2.0.0.6667" 

get :: URI -> IO String
get uri = do
  let req = Request uri GET [] ""
  eresp <- simpleHTTP $ insertHeader HdrUserAgent userAgent req
  case eresp of
    Left er -> error $ show er
    Right res -> return $ rspBody res

main = do
  args <- getArgs
  case parseURI (constructGoogleURL (unwords args)) of
    Nothing -> putStrLn "Invalid search"
    Just uri -> do
      body  <- get uri

      -- tuple version
      links <- runX (parseHTML body >>> selectGoogleResults)
      forM_ links $ \ (url,title) -> printf "%s <url:%s>\n" title url

      -- XML version
      [xml] <- runX (parseHTML body >>> 
                     -- At the top of a document is a hidden "root" node
                     -- which encompasses all the top-level siblings.
                     root [] [selectGoogleResultsToXML] >>> 
                     writeDocumentToString [withIndent yes])
      putStrLn xml