examples/re-nginx-log-processor.lhs

Example: NGINX Log Processor

This example program reads lines from NGINX error-log files and access-log files converts them into a unified output format.

It is an example of developing REs at scale using macros with the regex test bench.

The tool is self-testing: run it with no arguments (or cabal test).

{-# LANGUAGE NoImplicitPrelude          #-}
{-# LANGUAGE RecordWildCards            #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE TemplateHaskell            #-}
{-# LANGUAGE QuasiQuotes                #-}
{-# LANGUAGE OverloadedStrings          #-}

module Main
  ( main
  -- development
  , parse_a
  , parse_e
  ) where

import           Control.Applicative
import           Control.Monad
import qualified Data.ByteString.Lazy.Char8               as LBS
import           Data.Char
import           Data.Functor.Identity
import qualified Data.HashMap.Lazy                        as HML
import           Data.Maybe
import           Data.String
import qualified Data.Text                                as T
import           Data.Time
import           Prelude.Compat
import           System.Directory
import           System.Environment
import           System.Exit
import           System.FilePath
import           System.IO
import           TestKit
import           Text.Printf
import qualified Text.RE.PCRE                             as PCRE
import           Text.RE.PCRE.ByteString.Lazy
import qualified Text.RE.PCRE.String                      as S
import           Text.RE.REOptions
import           Text.RE.Replace
import           Text.RE.TestBench
import           Text.RE.Tools.Sed
main :: IO ()
main = do
  as  <- getArgs
  case as of
    ["--macro"      ] -> putStr      lp_macro_table
    ["--macro",mid_s] -> putStrLn  $ lp_macro_summary $ MacroID mid_s
    ["--regex"      ] -> putStr      lp_macro_sources
    ["--regex",mid_s] -> putStrLn  $ lp_macro_source  $ MacroID mid_s
    ["--test"       ] -> test
    []                -> test
    [in_file          ] | is_file in_file -> go True  in_file "-"
    [in_file,out_file ] | is_file in_file -> go True  in_file out_file
    _                                     -> usage
  where
    is_file = not . (== "--") . take 2

    usage = do
      pnm <- getProgName
      let prg = ((pnm++" ")++)
      putStr $ unlines
        [ "usage:"
        , prg " --help"
        , prg " --macro"
        , prg " --macro <macro-id>"
        , prg " --regex"
        , prg " --regex <macro-id>"
        , prg "[--test]"
        , prg "(-|<in-file>) [-|<out-file>]"
        ]

--
-- go
--

test :: IO ()
test = do
    putStrLn "============================================================"
    putStrLn "Testing the macro environment."
    putStrLn "nginx-log-processor"
    is_docs <- doesDirectoryExist "docs"
    when is_docs $
      dumpMacroTable  (fp "docs" ".txt") (fp "docs" "-src.txt") regexType lp_env
    dumpMacroTable    (fp "data" ".txt") (fp "data" "-src.txt") regexType lp_env
    me_ok <- testMacroEnv "nginx-log-processor" regexType lp_env
    putStrLn "============================================================"
    putStrLn "Testing the log processor on reference data."
    putStrLn ""
    lp_ok <- test_log_processor
    putStrLn "============================================================"
    case me_ok && lp_ok of
      True  -> return ()
      False -> exitWith $ ExitFailure 1
  where
    fp dir sfx = dir </> (rty_s ++ "-nginx-log-processor" ++ sfx)

    rty_s      = map toLower $ presentRegexType regexType

test_log_processor :: IO Bool
test_log_processor = do
    createDirectoryIfMissing False "tmp"
    go False "data/access-errors.log" "tmp/events.log"
    cmp "tmp/events.log" "data/events.log"

--
-- go
--

go :: Bool -> FilePath -> FilePath -> IO ()
go rprt_flg in_file out_file = do
  ctx <- setup rprt_flg
  sed (script ctx) in_file out_file
script :: Ctx -> Edits IO RE LBS.ByteString
script ctx = Select
    [ on [re_|@{access}|]     ACC parse_access
    , on [re_|@{access_deg}|] AQQ parse_deg_access
    , on [re_|@{error}|]      ERR parse_error
    , on [re_|.*|]            QQQ parse_def
    ]
  where
    on rex src prs = Function (rex lpo) TOP $ process_line ctx src prs

    parse_def      = fmap capturedText . matchCapture
process_line :: IsEvent a
             => Ctx
             -> Source
             -> (Match LBS.ByteString->Maybe a)
             -> LineNo
             -> Match LBS.ByteString
             -> RELocation
             -> Capture LBS.ByteString
             -> IO (Maybe LBS.ByteString)
process_line ctx src prs lno cs _ _ = do
    when (event_is_notifiable event) $
      flag_event ctx event
    return $ Just $ presentEvent event
  where
    event     = maybe def_event (mkEvent lno src) $ prs cs

    def_event =
      Event
        { _event_line     = lno
        , _event_source   = src
        , _event_utc      = read "1970-01-01 00:00:00"
        , _event_severity = Nothing
        , _event_address  = (0,0,0,0)
        , _event_details  = ""
        }


--
-- Ctx, setup, event_is_notifiable, flag_event
--

type Ctx = Bool

setup :: Bool -> IO Ctx
setup = return

event_is_notifiable :: Event -> Bool
event_is_notifiable Event{..} =
  fromEnum (fromMaybe Debug _event_severity) <= fromEnum Err

flag_event :: Ctx -> Event -> IO ()
flag_event False = const $ return ()
flag_event True  = LBS.hPutStrLn stderr . presentEvent


--
-- Event, presentEvent, IsEvent
--

data Event =
  Event
    { _event_line     :: LineNo
    , _event_source   :: Source
    , _event_utc      :: UTCTime
    , _event_severity :: Maybe Severity
    , _event_address  :: IPV4Address
    , _event_details  :: LBS.ByteString
    }
  deriving (Show)

data Source = ACC | AQQ | ERR | QQQ
  deriving (Show,Read)

presentEvent :: Event -> LBS.ByteString
presentEvent Event{..} = LBS.pack $
  printf "%04d %s %s %-7s %3d.%3d.%3d.%3d [%s]"
    (getLineNo                _event_line    )
    (show                     _event_source  )
    (show                     _event_utc     )
    (maybe "-" svrty_kw       _event_severity)
                              a b c d
    (LBS.unpack               _event_details )
  where
    (a,b,c,d) = _event_address

    svrty_kw  = T.unpack . fst . severityKeywords

class IsEvent a where
  mkEvent :: LineNo -> Source -> a -> Event

instance IsEvent Access where
  mkEvent lno src Access{..} =
    Event
      { _event_line     = lno
      , _event_source   = src
      , _event_utc      = _a_time_local
      , _event_severity = Nothing
      , _event_address  = _a_remote_addr
      , _event_details  = LBS.pack $
          printf "%s %d %d %s %s %s"
            (T.unpack _a_request        )
                      _a_status
                      _a_body_bytes
            (T.unpack _a_http_referrer  )
            (T.unpack _a_http_user_agent)
            (T.unpack _a_other          )
      }

instance IsEvent Error where
  mkEvent lno src ERROR{..} =
    Event
      { _event_line     = lno
      , _event_source   = src
      , _event_utc      = UTCTime _e_date $ timeOfDayToTime _e_time
      , _event_severity = Just _e_severity
      , _event_address  = (0,0,0,0)
      , _event_details  = LBS.pack $ printf "%d#%d: %s" pid tid $ LBS.unpack _e_other
      }
    where
      (pid,tid) = _e_pid_tid

instance IsEvent LBS.ByteString where
  mkEvent lno src lbs =
    Event
      { _event_line     = lno
      , _event_source   = src
      , _event_utc      = read "1970-01-01 00:00:00"
      , _event_severity = Nothing
      , _event_address  = (0,0,0,0)
      , _event_details  = lbs
      }


--
-- REOptions and Prelude
--

lpo :: PCRE.REOptions
lpo = PCRE.makeREOptions lp_prelude

lp_prelude :: Macros RE
lp_prelude = runIdentity $ mkMacros mk regexType ExclCaptures lp_env
  where
    mk   = maybe oops Identity . PCRE.compileRegexWithOptions PCRE.noPreludeREOptions

    oops = error "lp_prelude"

lp_macro_table :: String
lp_macro_table = formatMacroTable regexType lp_env

lp_macro_summary :: MacroID -> String
lp_macro_summary = formatMacroSummary regexType lp_env

lp_macro_sources :: String
lp_macro_sources = formatMacroSources regexType ExclCaptures lp_env

lp_macro_source :: MacroID -> String
lp_macro_source = formatMacroSource regexType ExclCaptures lp_env

lp_env :: MacroEnv
lp_env = PCRE.preludeEnv `HML.union` HML.fromList
    [ f "user"        user_macro
    , f "pid#tid:"    pid_tid_macro
    , f "access"      access_macro
    , f "access_deg"  access_deg_macro
    , f "error"       error_macro
    ]
  where
    f mid mk = (mid, mk lp_env mid)


--
-- The Macro Descriptors
--

user_macro :: MacroEnv -> MacroID -> MacroDescriptor
user_macro env mid =
  runTests regexType parse_user samples env mid
    MacroDescriptor
      { macroSource          = "(?:-|[^[:space:]]+)"
      , macroSamples         = map fst samples
      , macroCounterSamples = counter_samples
      , macroTestResults    = []
      , macroParser          = Just "parse_user"
      , macroDescription     = "a user ident (per RFC1413)"
      }
  where
    samples :: [(String,User)]
    samples =
        [ f "joe"
        ]
      where
        f nm = (nm,User $ LBS.pack nm)

    counter_samples =
        [ "joe user"
        ]

pid_tid_macro :: MacroEnv -> MacroID -> MacroDescriptor
pid_tid_macro env mid =
  runTests regexType parse_pid_tid samples env mid
    MacroDescriptor
      { macroSource          = "(?:@{%nat})#(?:@{%nat}):"
      , macroSamples         = map fst samples
      , macroCounterSamples = counter_samples
      , macroTestResults    = []
      , macroParser          = Just "parse_pid_tid"
      , macroDescription     = "<PID>#<TID>:"
      }
  where
    samples :: [(String,(Int,Int))]
    samples =
        [ f "1378#0:" (1378,0)
        ]
      where
        f = (,)

    counter_samples =
        [ ""
        , "24#:"
        , "24.365:"
        ]

access_macro :: MacroEnv -> MacroID -> MacroDescriptor
access_macro env mid =
  runTests' regexType (parse_access . fmap LBS.pack) samples env mid
    MacroDescriptor
      { macroSource          = access_re
      , macroSamples         = map fst samples
      , macroCounterSamples = counter_samples
      , macroTestResults    = []
      , macroParser          = Just "parse_a"
      , macroDescription     = "an Nginx access log file line"
      }
  where
    samples :: [(String,Access)]
    samples =
        [ (,) "192.168.100.200 - - [12/Jan/2016:12:08:36 +0000] \"GET / HTTP/1.1\" 200 3700 \"-\" \"My Agent\" \"-\""
            Access
              { _a_remote_addr      = (192,168,100,200)
              , _a_remote_user      = "-"
              , _a_time_local       = read "2016-01-12 12:08:36 UTC"
              , _a_request          = "GET / HTTP/1.1"
              , _a_status           = 200
              , _a_body_bytes       = 3700
              , _a_http_referrer    = "-"
              , _a_http_user_agent  = "My Agent"
              , _a_other            = "-"
              }
        ]

    counter_samples =
        [ ""
        , " -  [] \"\"   \"\" \"\" \"\""
        ]

access_deg_macro :: MacroEnv -> MacroID -> MacroDescriptor
access_deg_macro env mid =
  runTests' regexType (parse_deg_access . fmap LBS.pack) samples env mid
    MacroDescriptor
      { macroSource          = " -  \\[\\] \"\"   \"\" \"\" \"\""
      , macroSamples         = map fst samples
      , macroCounterSamples = counter_samples
      , macroTestResults    = []
      , macroParser          = Nothing
      , macroDescription     = "a degenerate Nginx access log file line"
      }
  where
    samples :: [(String,Access)]
    samples =
        [ (,) " -  [] \"\"   \"\" \"\" \"\"" deg_access
        ]

    counter_samples =
        [ ""
        , "foo"
        ]

error_macro :: MacroEnv -> MacroID -> MacroDescriptor
error_macro env mid =
  runTests' regexType (parse_error . fmap LBS.pack) samples env mid
    MacroDescriptor
      { macroSource          = error_re
      , macroSamples         = map fst samples
      , macroCounterSamples = counter_samples
      , macroTestResults    = []
      , macroParser          = Just "parse_e"
      , macroDescription     = "an Nginx error log file line"
      }
  where
    samples :: [(String,Error)]
    samples =
        [ (,) "2016/12/21 11:53:35 [emerg] 1378#0: foo"
            ERROR
              { _e_date     = read "2016-12-21"
              , _e_time     = read "11:53:35"
              , _e_severity = Emerg
              , _e_pid_tid  = (1378,0)
              , _e_other    = " foo"
              }
        , (,) "2017/01/04 05:40:19 [error] 31623#0: *1861296 no \"ssl_certificate\" is defined in server listening on SSL port while SSL handshaking, client: 192.168.31.38, server: 0.0.0.0:80"
            ERROR
              { _e_date     = read "2017-01-04"
              , _e_time     = read "05:40:19"
              , _e_severity = Err
              , _e_pid_tid  = (31623,0)
              , _e_other    = " *1861296 no \"ssl_certificate\" is defined in server listening on SSL port while SSL handshaking, client: 192.168.31.38, server: 0.0.0.0:80"
              }
        ]

    counter_samples =
        [ ""
        , "foo"
        ]

--
-- Access, access_re, deg_access, parse_deg_access, parse_access
--

data Access =
  Access
    { _a_remote_addr      :: !IPV4Address
    , _a_remote_user      :: !User
    , _a_time_local       :: !UTCTime
    , _a_request          :: !T.Text
    , _a_status           :: !Int
    , _a_body_bytes       :: !Int
    , _a_http_referrer    :: !T.Text
    , _a_http_user_agent  :: !T.Text
    , _a_other            :: !T.Text
    }
  deriving (Eq,Show)
access_re :: RegexSource
access_re = RegexSource $ unwords
  [ "(@{%address.ipv4})"
  , "-"
  , "(@{user})"
  , "\\[(@{%datetime.clf})\\]"
  , "(@{%string.simple})"
  , "(@{%nat})"
  , "(@{%nat})"
  , "(@{%string.simple})"
  , "(@{%string.simple})"
  , "(@{%string.simple})"
  ]
deg_access :: Access
deg_access =
  Access
    { _a_remote_addr      = (0,0,0,0)
    , _a_remote_user      = "-"
    , _a_time_local       = read "1970-01-01 00:00:00"
    , _a_request          = ""
    , _a_status           = 0
    , _a_body_bytes       = 0
    , _a_http_referrer    = ""
    , _a_http_user_agent  = ""
    , _a_other            = ""
    }

parse_deg_access :: Match LBS.ByteString -> Maybe Access
parse_deg_access Match{..} =
  case matchSource == " -  [] \"\"   \"\" \"\" \"\"" of
    True  -> Just deg_access
    False -> Nothing

parse_a :: LBS.ByteString -> Maybe Access
parse_a lbs = parse_access $ lbs ?=~ [re_|@{access}|] lpo

parse_access :: Match LBS.ByteString -> Maybe Access
parse_access cs =
  Access
    <$> f parseIPv4Address  [cp|1|]
    <*> f parse_user        [cp|2|]
    <*> f parseDateTimeCLF  [cp|3|]
    <*> f parseSimpleString [cp|4|]
    <*> f parseInteger      [cp|5|]
    <*> f parseInteger      [cp|6|]
    <*> f parseSimpleString [cp|7|]
    <*> f parseSimpleString [cp|8|]
    <*> f parseSimpleString [cp|9|]
  where
    f psr i = psr $ capturedText $ capture i cs


--
-- Error, error_re, parse_error
--

data Error =
  ERROR
    { _e_date     :: Day
    , _e_time     :: TimeOfDay
    , _e_severity :: Severity
    , _e_pid_tid  :: (Int,Int)
    , _e_other    :: LBS.ByteString
    }
  deriving (Eq,Show)

error_re :: RegexSource
error_re = RegexSource $ unwords
  [ "(@{%date.slashes})"
  , "(@{%time})"
  , "\\[(@{%syslog.severity})\\]"
  , "(@{pid#tid:})(.*)"
  ]

parse_e :: LBS.ByteString -> Maybe Error
parse_e lbs = parse_error $ lbs ?=~ [re_|@{error}|] lpo

parse_error :: Match LBS.ByteString -> Maybe Error
parse_error cs =
  ERROR
    <$> f parseSlashesDate    [cp|1|]
    <*> f parseTimeOfDay      [cp|2|]
    <*> f parseSeverity [cp|3|]
    <*> f parse_pid_tid       [cp|4|]
    <*> f Just                [cp|5|]
  where
    f psr i = psr $ capturedText $ capture i cs


--
-- User, parseUser
--

newtype User =
    User { _User :: LBS.ByteString }
  deriving (IsString,Ord,Eq,Show)

parse_user :: Replace a => a -> Maybe User
parse_user = Just . User . LBS.pack . unpackR


--
-- parse_pid_tid
--

parse_pid_tid :: Replace a => a -> Maybe (Int,Int)
parse_pid_tid x = case allMatches $ unpackR x S.*=~ [re|@{%nat}|] of
    [cs,cs'] -> (,) <$> p cs <*> p cs'
    _        -> Nothing
  where
    p cs = matchCapture cs >>= parseInteger . capturedText

regexType :: RegexType
regexType = PCRE.regexType