{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 800
{-# LANGUAGE TemplateHaskellQuotes #-}
#else
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
#endif
module Text.RE.ZeInternals.NamedCaptures
( cp
, extractNamedCaptures
, idFormatTokenREOptions
, Token(..)
, validToken
, formatTokens
, formatTokens'
, formatTokens0
, scan
) where
import Data.Char
import qualified Data.HashMap.Strict as HM
import qualified Data.Text as T
import GHC.Generics
import qualified Language.Haskell.TH as TH
import Language.Haskell.TH.Quote
import Text.RE.ZeInternals.PreludeMacros
import Text.RE.ZeInternals.QQ
import Text.RE.ZeInternals.TestBench
import Text.RE.ZeInternals.Tools.Lex
import Text.RE.ZeInternals.Types.CaptureID
import Text.RE.ZeInternals.Types.Match
import Text.RE.ZeInternals.Types.Poss
import Text.Regex.TDFA
-- | quasi quoter for CaptureID: @[cp|0|]@, @[cp|0|]@, etc.,
-- indexing captures by classic positional numbers, and @[cp|foo|]@,
-- etc., referencing a named capture @[re| ... ${foo}( ... ) ... |]@.
cp :: QuasiQuoter
cp =
(qq0 "cp")
{ quoteExp = parse_capture
}
-- | extract the CaptureNames from an RE or return an error diagnostic
-- if the RE is not well formed; also returs the total number of captures
-- in the RE
extractNamedCaptures :: String -> Either String ((Int,CaptureNames),String)
extractNamedCaptures s = Right (analyseTokens tks,formatTokens tks)
where
tks = scan s
Token
-- | our RE scanner returns a list of these tokens
data Token
= ECap (Maybe String)
| PGrp
| PCap
| Bra
| BS Char
| Other Char
deriving (Show,Generic,Eq)
-- | check that a token is well formed
validToken :: Token -> Bool
validToken tkn = case tkn of
ECap mb -> maybe True check_ecap mb
PGrp -> True
PCap -> True
Bra -> True
BS c -> is_dot c
Other c -> is_dot c
where
check_ecap s = not (null s) && all not_br s
is_dot c = c/='\n'
not_br c = not $ c `elem` "{}\n"
Analysing Token -> CaptureNames
-- | analyse a token stream, returning the number of captures and the
-- 'CaptureNames'
analyseTokens :: [Token] -> (Int,CaptureNames)
analyseTokens tks0 = case count_em 1 tks0 of
(n,as) -> (n-1, HM.fromList as)
where
count_em n [] = (n,[])
count_em n (tk:tks) = case count_em (n `seq` n+d) tks of
(n',as) -> (n',bd++as)
where
(d,bd) = case tk of
ECap (Just nm) -> (,) 1 [(CaptureName $ T.pack nm,CaptureOrdinal n)]
ECap Nothing -> (,) 1 []
PGrp -> (,) 0 []
PCap -> (,) 1 []
Bra -> (,) 1 []
BS _ -> (,) 0 []
Other _ -> (,) 0 []
Scanning Regex Strings
-- | scan a RE string into a list of RE Token
scan :: String -> [Token]
scan = alex' match al $ oops "top"
where
al :: [(Regex,Match String->Maybe Token)]
al =
[ mk "\\$\\{([^{}]+)\\}\\(" $ ECap . Just . x_1
, mk "\\$\\(" $ const $ ECap Nothing
, mk "\\(\\?:" $ const PGrp
, mk "\\(\\?" $ const PCap
, mk "\\(" $ const Bra
, mk "\\\\(.)" $ BS . s2c . x_1
, mk "(.|\n)" $ Other . s2c . x_1
]
x_1 = captureText $ IsCaptureOrdinal $ CaptureOrdinal 1
s2c [c] = c
s2c _ = oops "s2c"
mk s f = (poss error id $ makeRegexM s,Just . f)
oops m = error $ "NamedCaptures.scan: " ++ m
Parsing captures
parse_capture :: String -> TH.Q TH.Exp
parse_capture s = case all isDigit s of
True -> [|IsCaptureOrdinal $ CaptureOrdinal $ read s|]
False -> [|IsCaptureName $ CaptureName $ T.pack s|]
-- | format [Token] into an RE string
formatTokens :: [Token] -> String
formatTokens = formatTokens' defFormatTokenREOptions
-- | options for the general Token formatter below
data FormatTokenREOptions =
FormatTokenREOptions
{ _fto_regex_type :: Maybe RegexType -- ^ Posix, PCRE or indeterminate REs?
, _fto_min_caps :: Bool -- ^ remove captures where possible
, _fto_incl_caps :: Bool -- ^ include the captures in the output
}
deriving (Show)
-- | the default configuration for the Token formatter
defFormatTokenREOptions :: FormatTokenREOptions
defFormatTokenREOptions =
FormatTokenREOptions
{ _fto_regex_type = Nothing
, _fto_min_caps = False
, _fto_incl_caps = False
}
-- | a configuration that will preserve the parsed regular expression
-- in the output
idFormatTokenREOptions :: FormatTokenREOptions
idFormatTokenREOptions =
FormatTokenREOptions
{ _fto_regex_type = Nothing
, _fto_min_caps = False
, _fto_incl_caps = True
}
-- | the general Token formatter, generating REs according to the options
formatTokens' :: FormatTokenREOptions -> [Token] -> String
formatTokens' FormatTokenREOptions{..} = foldr f ""
where
f tk tl = t_s ++ tl
where
t_s = case tk of
ECap mb -> ecap mb
PGrp -> if maybe False isTDFA _fto_regex_type then "(" else "(?:"
PCap -> "(?"
Bra -> bra _fto_min_caps
BS c -> "\\" ++ [c]
Other c -> [c]
ecap mb = case _fto_incl_caps of
True -> case mb of
Nothing -> "$("
Just nm -> "${"++nm++"}("
False -> bra _fto_min_caps
bra mc = case mc && maybe False isPCRE _fto_regex_type of
True -> "(?:"
False -> "("
-- this is a reference of formatTokens defFormatTokenREOptions,
-- used for testing the latter
formatTokens0 :: [Token] -> String
formatTokens0 = foldr f ""
where
f tk tl = t_s ++ tl
where
t_s = case tk of
ECap _ -> "("
PGrp -> "(?:"
PCap -> "(?"
Bra -> "("
BS c -> "\\" ++ [c]
Other c -> [c]