Text.RE.ZeInternals.NamedCaptures

{-# LANGUAGE DeriveGeneric              #-}
{-# LANGUAGE RecordWildCards            #-}
{-# LANGUAGE CPP                        #-}
#if __GLASGOW_HASKELL__ >= 800
{-# LANGUAGE TemplateHaskellQuotes      #-}
#else
{-# LANGUAGE QuasiQuotes                #-}
{-# LANGUAGE TemplateHaskell            #-}
#endif

module Text.RE.ZeInternals.NamedCaptures
  ( cp
  , extractNamedCaptures
  , idFormatTokenREOptions
  , Token(..)
  , validToken
  , formatTokens
  , formatTokens'
  , formatTokens0
  , scan
  ) where

import           Data.Char
import qualified Data.HashMap.Strict          as HM
import qualified Data.Text                    as T
import           GHC.Generics
import qualified Language.Haskell.TH          as TH
import           Language.Haskell.TH.Quote
import           Text.RE.ZeInternals.PreludeMacros
import           Text.RE.ZeInternals.QQ
import           Text.RE.ZeInternals.TestBench
import           Text.RE.ZeInternals.Tools.Lex
import           Text.RE.ZeInternals.Types.CaptureID
import           Text.RE.ZeInternals.Types.Match
import           Text.RE.ZeInternals.Types.Poss
import           Text.Regex.TDFA


-- | quasi quoter for 'CaptureID' values: @[cp|0|]@, @[cp|1|]@, etc.,
-- select a capture by its classic positional number, while @[cp|foo|]@,
-- etc., select the named capture introduced by
-- @[re| ... ${foo}( ... ) ... |]@; only the expression context is
-- overridden here, every other field is that of @qq0 "cp"@
cp :: QuasiQuoter
cp = base { quoteExp = parse_capture }
  where
    base = qq0 "cp"

-- | scan an RE, returning its 'CaptureNames' together with the total
-- number of captures in the RE, and the RE as re-formatted by
-- 'formatTokens'; the 'Left' alternative is reserved for an error
-- diagnostic for an RE that is not well formed, but this implementation
-- always returns 'Right'
extractNamedCaptures :: String -> Either String ((Int,CaptureNames),String)
extractNamedCaptures s =
    let tks = scan s
      in Right (analyseTokens tks,formatTokens tks)

Token

-- | our RE scanner returns a list of these tokens; each token stands for
-- one lexeme recognised by 'scan'
data Token
  = ECap (Maybe String)   -- ^ an extended capture: @${name}(@ carries
                          -- @Just name@, the anonymous @$(@ carries
                          -- 'Nothing'
  | PGrp                  -- ^ the three characters @(?:@
  | PCap                  -- ^ the two characters @(?@
  | Bra                   -- ^ a plain opening bracket, @(@
  | BS          Char      -- ^ a backslash followed by the given character
  | Other       Char      -- ^ any other single character
  deriving (Show,Generic,Eq)

-- | check that a token is well formed: a capture name (if present) must
-- be non-empty and free of braces and newlines, and the character
-- carried by a 'BS' or 'Other' token must not be a newline; the
-- remaining tokens are always valid
validToken :: Token -> Bool
validToken (ECap Nothing)   = True
validToken (ECap (Just nm)) = not (null nm) && all (`notElem` "{}\n") nm
validToken PGrp             = True
validToken PCap             = True
validToken Bra              = True
validToken (BS    c)        = c /= '\n'
validToken (Other c)        = c /= '\n'

Analysing Token -> CaptureNames

-- | analyse a token stream, returning the number of captures and the
-- 'CaptureNames'; captures are numbered from 1 in the order their
-- opening tokens appear, and each named capture is associated with the
-- ordinal of its own opening token
analyseTokens :: [Token] -> (Int,CaptureNames)
analyseTokens tks = (sum widths,HM.fromList named)
  where
    -- the number of captures opened by each token (0 or 1)
    widths   = map width tks

    -- the ordinal that a capture opened by each token would receive
    ordinals = scanl (+) 1 widths

    -- the named captures, in order of appearance
    named    =
      [ (CaptureName $ T.pack nm,CaptureOrdinal n)
        | (ECap (Just nm),n) <- zip tks ordinals
        ]

    width tk = case tk of
      ECap  _ -> 1
      PGrp    -> 0
      PCap    -> 1
      Bra     -> 1
      BS    _ -> 0
      Other _ -> 0

Scanning Regex Strings

-- | scan a RE string into a list of RE Token, using the lexer table
-- below; the scanning itself is delegated to @alex'@
scan :: String -> [Token]
scan = alex' match rules $ oops "top"
  where
    -- the lexer table: one regex per kind of 'Token', paired with the
    -- function building the token from the match; the order is kept
    -- significant, with the longer bracket forms listed ahead of their
    -- prefixes and the catch-all rule last
    -- NOTE(review): presumably alex' tries the rules in order -- confirm
    rules :: [(Regex,Match String->Maybe Token)]
    rules =
      [ rule "\\$\\{([^{}]+)\\}\\(" (ECap . Just . first_cap)
      , rule "\\$\\("               (const (ECap Nothing))
      , rule "\\(\\?:"              (const PGrp)
      , rule "\\(\\?"               (const PCap)
      , rule "\\("                  (const Bra)
      , rule "\\\\(.)"              (BS    . to_char . first_cap)
      , rule "(.|\n)"               (Other . to_char . first_cap)
      ]

    -- compile the regex for a rule (calling error if that fails) and
    -- wrap its token builder in Just
    rule re_src build = (poss error id (makeRegexM re_src),\mtch -> Just (build mtch))

    -- the text of the first capture of a match
    first_cap = captureText (IsCaptureOrdinal (CaptureOrdinal 1))

    -- the captures of the last two rules match exactly one character
    to_char str = case str of
      [c] -> c
      _   -> oops "s2c"

    oops msg = error ("NamedCaptures.scan: " ++ msg)

Parsing captures

-- | generate the expression for the body of a @[cp|...|]@ quasi quote:
-- a non-empty string of digits yields a positional capture (the digits
-- are converted with 'read' in the generated expression) and any other
-- non-empty string yields a named capture; an empty body is rejected
-- when the quasi quote is expanded, because @all isDigit ""@ holds
-- vacuously and would otherwise generate @read ""@, which can only fail
-- at run time
parse_capture :: String -> TH.Q TH.Exp
parse_capture s
  | null s        = fail "cp: empty capture identifier"
  | all isDigit s = [|IsCaptureOrdinal $ CaptureOrdinal $ read s|]
  | otherwise     = [|IsCaptureName    $ CaptureName $ T.pack  s|]

Formatting Token

-- | format [Token] into an RE string, using the default options of the
-- general formatter ('defFormatTokenREOptions')
formatTokens :: [Token] -> String
formatTokens tks = formatTokens' defFormatTokenREOptions tks

-- | options for the general Token formatter below, which consults them
-- when rendering the 'ECap', 'PGrp' and 'Bra' tokens
data FormatTokenREOptions =
  FormatTokenREOptions
    { _fto_regex_type :: Maybe RegexType    -- ^ Posix, PCRE or indeterminate REs?
                                            -- ('Nothing' for indeterminate);
                                            -- when @isTDFA@ holds of the type
                                            -- a 'PGrp' is rendered as @(@
                                            -- instead of @(?:@
    , _fto_min_caps   :: Bool               -- ^ remove captures where possible:
                                            -- when set and @isPCRE@ holds of
                                            -- the regex type, plain brackets
                                            -- are rendered as @(?:@
    , _fto_incl_caps  :: Bool               -- ^ include the captures in the output:
                                            -- when set an 'ECap' keeps its
                                            -- @$(@ or @${name}(@ notation,
                                            -- otherwise it is rendered like a
                                            -- plain bracket
    }
  deriving (Show)

-- | the default configuration for the Token formatter: the type of RE
-- is left indeterminate, no captures are removed and the extended
-- capture notation is not included in the output
defFormatTokenREOptions :: FormatTokenREOptions
defFormatTokenREOptions = FormatTokenREOptions
  { _fto_incl_caps  = False     -- extended captures become plain brackets
  , _fto_min_caps   = False     -- plain brackets stay capturing
  , _fto_regex_type = Nothing   -- indeterminate type of RE
  }

-- | a configuration that will preserve the parsed regular expression
-- in the output: the default configuration, except that the extended
-- capture notation is included
idFormatTokenREOptions :: FormatTokenREOptions
idFormatTokenREOptions = defFormatTokenREOptions { _fto_incl_caps = True }

-- | the general Token formatter, generating REs according to the
-- options: each token is rendered independently and the renderings are
-- concatenated in order
formatTokens' :: FormatTokenREOptions -> [Token] -> String
formatTokens' FormatTokenREOptions{..} = concatMap render
  where
    render (ECap mb_nm)
      | _fto_incl_caps = maybe "$(" (\nm -> "${" ++ nm ++ "}(") mb_nm
      | otherwise      = open_bra
    render PGrp
      | re_is isTDFA   = "("
      | otherwise      = "(?:"
    render PCap        = "(?"
    render Bra         = open_bra
    render (BS    c)   = ['\\',c]
    render (Other c)   = [c]

    -- an opening bracket, made non-capturing only when captures are
    -- being minimised and the RE is known to be a PCRE RE
    open_bra
      | _fto_min_caps && re_is isPCRE = "(?:"
      | otherwise                     = "("

    -- test the regex type; an indeterminate type fails every test
    re_is p = maybe False p _fto_regex_type

-- | a reference implementation of @formatTokens' defFormatTokenREOptions@
-- (i.e., of 'formatTokens'), used for testing the latter: every capture
-- is rendered as a plain bracket and all other tokens are reproduced
-- verbatim
formatTokens0 :: [Token] -> String
formatTokens0 = concatMap render
  where
    render (ECap  _) = "("
    render PGrp      = "(?:"
    render PCap      = "(?"
    render Bra       = "("
    render (BS    c) = ['\\',c]
    render (Other c) = [c]