Syslog Parserを haskellで書いてみる。

過去にBisonErlangYeccを使ってパーサーを書いたことがあるのですが、作成数ヶ月後にソースコードを読んでみても何をしているのかさっぱり忘れてしまい苦労したことがあります。
おまけにデバッグ大変ですし・・・・

今回はHaskell のattoparsecライブラリを使ってRFC3164のパーサーを書いてみました。

Real world haskellChapter 16. Using Parsecではattoparsecのベースとなったparsecの説明があるので参考になると思います。

Real world haskellChapter 27. Sockets and Syslogでもsyslog serverの実装方法が取り上げていますが、ただ単にsyslog serverが受信したデータをputStrLnで表示しているだけなので、少々素っ気ないです。

RFC3164詳細はこちら
http://www.ietf.org/rfc/rfc3164.txt

まずはReal world haskellの- file: ch27/SyslogTypes.hs を少し変更

module SyslogPriority 
	(
	  Severity
	, Facility
	, codeOfFac
	, facOfCode
	, codeOfPri
	, priOfCode
	) where

data Severity = DEBUG
              | INFO
              | NOTICE
              | WARNING
              | ERROR
              | CRITICAL
              | ALART
              | EMERGENCY
                deriving (Show, Eq, Read, Enum, Ord)

priToCode :: [(Severity, Int)]
priToCode = [
             (DEBUG, 7),
             (INFO, 6),
             (NOTICE, 5),
             (WARNING, 4),
             (ERROR, 3),
             (CRITICAL, 2),
             (ALART, 1),
             (EMERGENCY, 0)
            ]
    
codeOfPri :: Severity -> Int
codeOfPri p = case lookup p priToCode of
                Just x -> x
                _ -> error $ "Internal error in priOfCode"

priOfCode :: Int -> Severity
priOfCode p = case lookup p $ codeTo priToCode of
                Just x -> x
                _ -> error $ "Internal error in codeOfPri"
                 
data Facility = KERN
              | USER
              | MAIL
              | DAEMON
              | AUTH
              | SYSLOG
              | LPR
              | NEWS
              | UUCP
              | CRON
              | AUTHPRIV
              | FTP
              | NTP
              | LOGAUDIT
              | LOGWARNING
              | CLOCKDAEMON
              | LOCAL0
              | LOCAL1
              | LOCAL2
              | LOCAL3
              | LOCAL4
              | LOCAL5
              | LOCAL6
              | LOCAL7
                deriving (Show, Eq, Ord)
                
facToCode :: [(Facility, Int)]
facToCode = [
             (KERN, 0),
             (USER, 1),
             (MAIL, 2),
             (DAEMON, 3),
             (AUTH, 4),
             (SYSLOG, 5),
             (LPR, 6),
             (NEWS, 7),
             (UUCP, 8),
             (CRON, 9),
             (AUTHPRIV, 10),
             (FTP, 11),
             (NTP, 12),
             (LOGAUDIT, 13),
             (LOGWARNING, 14),
             (CLOCKDAEMON, 15),
             (LOCAL0, 16),
             (LOCAL1, 17),
             (LOCAL2, 18),
             (LOCAL3, 19),
             (LOCAL4, 20),
             (LOCAL5, 21),
             (LOCAL6, 22),
             (LOCAL7, 23)
            ]

codeOfFac :: Facility -> Int
codeOfFac f = case lookup f facToCode of
                Just x -> x
                _ -> error $ "Internal error in codeOfFac"

facOfCode :: Int -> Facility
facOfCode f = case lookup f $ codeTo facToCode of
                Just x -> x
                _ -> error $ "Internal error in facOfCode"

codeTo :: [(x, y)] -> [(y, x)]
codeTo = map (\(x, y) -> (y, x))

そしてパーサー RFC3164.hs

{-# LANGUAGE OverloadedStrings #-}

module RFC3164 
    (
      parseRFC3164
    , RFC3164(..)
    , PRI(..)
    , Header(..)
    , Msg(..)
    ) where

import SyslogPriority
import Control.Applicative
import Data.Attoparsec as P
import qualified Data.Attoparsec.Char8 as P8
import Data.Attoparsec.Char8 (char8, endOfLine)
import qualified Data.ByteString.Char8 as B hiding (map)


{-
http://www.ietf.org/rfc/rfc3164.txt
-}

-- SYSLOG-MSG      = PRI SP HEADER SP MSG
data RFC3164 = RFC3164
    {
      pri :: PRI
    , header :: Header
    , msg :: Msg
    } deriving (Eq, Ord, Show)

-- | parser for rfc 3164
-- >>> :set -XOverloadedStrings 
-- >>> parseTest parseRFC3164 "<34>Oct 11 22:14:15 mymachine su: 'su root' failed for lonvick on /dev/pts/8\n "
-- Done " " RFC3164 {pri = PRI {severity = CRITICAL, facility = AUTH}, header = Header {timestamp = "Oct 11 22:14:15", hostName = "mymachine"}, msg = Msg {tag = Just "su", procId = Nothing, content = " 'su root' failed for lonvick on /dev/pts/8"}}
--
-- >>> parseTest parseRFC3164 "<0>Oct 22 10:52:12 scapegoat apache[2321]: 10.1.2.3 sched[0]: That's All Folks!\n "
-- Done " " RFC3164 {pri = PRI {severity = EMERGENCY, facility = KERN}, header = Header {timestamp = "Oct 22 10:52:12", hostName = "scapegoat"}, msg = Msg {tag = Just "apache", procId = Just 2321, content = " 10.1.2.3 sched[0]: That's All Folks!"}}
--
parseRFC3164 :: Parser RFC3164
parseRFC3164 = do
    pri' <- parsePRI
    header' <- parseHeader
    msg' <-  parseMsg
    return $ RFC3164 pri' header' msg'
 
-- PRI = '<' INT '>'
-- INT = 1*3 DIGIT ; range 0 .. 191
--
data PRI = PRI {
	  severity :: Severity
	, facility :: Facility
	} deriving (Eq, Ord, Show) 

-- | facrity and priority check
--
-- Example:
--
-- >>> parseTest parsePRI "<0> "
-- Done " " PRI {severity = EMERGENCY, facility = KERN}
-- 
-- >>> parseTest parsePRI "<191> "
-- Done " " PRI {severity = DEBUG, facility = LOCAL7}
--

parsePRI :: Parser PRI  
parsePRI = do
    pri' <- char8 '<' *> P8.decimal <* char8 '>'
    return $ PRI (parseSeverity pri') (parseFac pri') 

parseFac :: Int -> Facility
parseFac pri' = facOfCode $  pri' `div` 8
    
parseSeverity :: Int -> Severity
parseSeverity pri' = priOfCode $  pri' `mod` 8

-- HEADER          = TIMESTAMP SP HOSTNAME
-- TIMESTAMP = MMM SP DD SP HH COL MM COL SS
-- MMM       = 3PRINTUSASCII  ; Jan Feb Mar Arp May Jun Jul Aug Sep Oct Nov Dev
-- DD        = 2DIGIT  ; <SP>1-28, <SP>1-29, <SP>1-30, <SP>1-31
-- HH        = 2DIGIT  ; 00-23
-- MM        = 2DIGIT  ; 00-59
-- HOSTNAME  =  IPv4 | IPv6 | PRINTUSASCII/DIGIT
-- IPv4      = 3NONZERO-DIGIT DOT 3NONZERO-DIGIT DOT 3NONZERO-DIGIT
-- IPv6     = 

data Header = Header 
    {
       timestamp :: !B.ByteString
     , hostName :: !B.ByteString
     } deriving (Show, Eq, Ord)

-- | header parser check
--
-- example:
--
-- >>> parseTest parseHeader "Feb 12 19:27:01 hostname "
-- Done "" Header {timestamp = "Feb 12 19:27:01", hostName = "hostname"}
--
-- >>> parseTest parseHeader "Feb  2 19:27:01 hostname "
-- Done "" Header {timestamp = "Feb  2 19:27:01", hostName = "hostname"}
--
parseHeader :: Parser Header
parseHeader = do
    time <- P8.take 15 <* char8 ' ' 
    host <- takeTill P8.isHorizontalSpace <* char8 ' '
    return $ Header time host
    
data Msg = Msg {
	  tag :: Maybe B.ByteString
	, procId :: Maybe Int
	, content :: !B.ByteString
	} deriving (Eq, Ord, Show)

--MSG             = TAG COL CONTENT
--TAG		  = 1*32 PROCNAME [ PID ]:
--PROCNAME	  = PRINTUSASCII / DIGIT
--TAGID           = DIGIT  
--CONTENT         = PRINTUSASCII
--SP              = %d32
--PRINTUSASCII    = %d33-126
--NONZERO-DIGIT   = %d49-57
--DIGIT           = %d48 / NONZERO-DIGIT
--DOT		  = %46
--COL		  = %58
--[		  = %91
--]		  = %93

-- | msg parser check
--
-- example:
--
-- >>> parseTest parseMsg "app[12]: message starting \n" -- "12]: message starting \n"
-- Done "" Msg {tag = Just "app", procId = Just 12, content = " message starting "}
--
-- >>> parseTest parseMsg "app: message starting \n" --"message starting \n" 
-- Done "" Msg {tag = Just "app", procId = Nothing, content = " message starting "}
--
--

parseMsg :: Parser Msg
parseMsg = do
    tag' <- Just <$> try (takeTill (\c -> inClass "[:" c)) <|> pure Nothing <$> char8 '['
    pid' <- Just <$> try (char8 '[' *> P8.decimal <* char8 ']' <* char8 ':') <|> pure Nothing <$> char8 ':'
    content' <- takeTill P8.isEndOfLine <* endOfLine
    return $ Msg tag' pid' content'

doctest(ghc7.4.2)で動作を確認
※doctestがない場合は "cabal install doctest"でインストールください

$ doctest RFC3164.hs 
There are 4 tests, with 9 total interactions.
Examples: 4  Tried: 4  Errors: 0  Failures: 0

network-conduitattoparsec-conduitを参考にsyslog server(collector)を実装してみるとおもしろいかもしれません。