This tutorial is a literate Haskell program whwre we start by specifying the language pragmas and imports we will need for this module.
{-# LANGUAGE QuasiQuotes #-}
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.List
import Text.RE.Replace
import Text.RE.TDFA.String
import Text.RE.Tools
The IsRegex re tx
provides regex methods for the RE type re
(belonging to either the TDFA or PCRE back end) and a text type tx
that the re
back end accepts. The Text.RE.TDFA
and Text.RE.PCRE
API modules provide functions that work over all the text types, with the following match operators:
(*=~) :: IsRegex RE s
=> s
-> RE
-> Matches s
(?=~) :: IsRegex RE s
=> s
-> RE
-> Match s
(*=~/) :: IsRegex RE s => s -> SearchReplace RE s -> s
(?=~/) :: IsRegex RE s => s -> SearchReplace RE s -> s
The IsRegex
class is located in Text.RE.Tools.IsRegex
:
-- | the 'IsRegex' class allows polymorhic tools to be written that
-- will work with a variety of regex back ends and text types
class Replace s => IsRegex re s where
-- | finding the first match
matchOnce :: re -> s -> Match s
-- | finding all matches
matchMany :: re -> s -> Matches s
-- | compiling an RE, failing if the RE is not well formed
makeRegex :: (Functor m,Monad m, MonadFail m) => s -> m re
-- | comiling an RE, specifying the 'SimpleREOptions'
makeRegexWith :: (Functor m,Monad m, MonadFail m) => SimpleREOptions -> s -> m re
-- | compiling a 'SearchReplace' template from the RE text and the template Text, failing if they are not well formed
makeSearchReplace :: (Functor m,Monad m, MonadFail m,IsRegex re s) => s -> s -> m (SearchReplace re s)
-- | compiling a 'SearchReplace' template specifing the 'SimpleREOptions' for the RE
makeSearchReplaceWith :: (Functor m,Monad m, MonadFail m,IsRegex re s) => SimpleREOptions -> s -> s -> m (SearchReplace re s)
-- | incorporate an escaped string into a compiled RE with the default options
makeEscaped :: (Functor m,Monad m, MonadFail m) => (s->s) -> s -> m re
-- | incorporate an escaped string into a compiled RE with the specified 'SimpleREOptions'
makeEscapedWith :: (Functor m,Monad m, MonadFail m) => SimpleREOptions -> (s->s) -> s -> m re
-- | extract the text of the RE from the RE
regexSource :: re -> s
= makeRegexWith minBound
makeRegex = makeSearchReplaceWith minBound
makeSearchReplace = makeEscapedWith minBound
makeEscaped = makeRegexWith o . f . packR . escapeREString . unpackR makeEscapedWith o f
(?=~%) :: IsRegex re s => s -> re -> Match s
?=~%) = flip matchOnce
(
(*=~%) :: IsRegex re s => s -> re -> Matches s
*=~%) = flip matchMany (
> matched $ (LBS.pack "2016-01-09 2015-12-5 2015-10-05") ?=~% [re|[0-9]{4}-[0-9]{2}-[0-9]{2}|]
ghciTrue
> countMatches $ (LBS.pack "2016-01-09 2015-12-5 2015-10-05") *=~% [re|[0-9]{4}-[0-9]{2}-[0-9]{2}|]
ghci2
regex
provides some classic tools that have quickly proven themselves in the examples and scripts used to maintain regex itself.
The classic tools assocciated with regular expressions have inspired some regex conterparts.
Text.RE.Tools.Grep: takes a regular expression and a file or lazy ByteString (depending upon the variant) and returns all of the matching lines.
Text.RE.Tools.Lex: takes an association list of REs and token-generating functions and the input text and returns a list of tokens. This should never be used where performance is important (use Alex), except as a development prototype.
Text.RE.Tools.Sed using Text.RE.Tools.Edit: takes an association list of regular expressions and substitution actions, some input text and invokes the associated action on each line of the file that matches one of the REs, substituting the text returned from the action in the output stream.
Text.RE.Tools.Find: scans a directory tree in the file system executing an action against all of the files that match RE.
These tools are built on top of the core library and act as good examples of how to use the regex library as well as useful tools.
The following sections will present some of the internal library code used to build the tools as well as some code from the example programs. These fragments work best as starting points for studying these tools.
Edits
scripts are applied to each line of the text by the sed
functions.
-- | an 'Edits' script will, for each line in the file, either perform
-- the action selected by the first RE in the list, or perform all of the
-- actions on line, arranged as a pipeline
data Edits m re s
= Select ![Edit m re s] -- ^ for each line select the first @Edit@ to match each line and edit the line with it
| Pipe ![Edit m re s] -- ^ for each line apply every edit that matches in turn to the line
-- | each Edit action specifies how the match should be processed
data Edit m re s
= Template !(SearchReplace re s)
-- ^ replace the match with this template text, substituting ${capture} as apropriate
| Function !re REContext !(LineNo->Match s->RELocation->Capture s->m (Maybe s))
-- ^ use this function to replace the 'REContext' specified captures in each line matched
| LineEdit !re !(LineNo->Matches s->m (LineEdit s))
-- ^ use this function to edit each line matched
-- | a LineEdit is the most general action thar can be performed on a line
-- and is the only means of deleting a line
data LineEdit s
= NoEdit -- ^ do not edit this line but leave as is
| ReplaceWith !s -- ^ replace the line with this text (terminating newline should not be included)
| Delete -- ^ delete the this line altogether
deriving (Functor,Show)
sed'
applies the script in its first argument to each line in the text in its second argument.
-- | apply an 'Edits' script to each line of the argument text
sed' :: (IsRegex re a,Monad m,Functor m)
=> Edits m re a
-> a
-> m a
= do
sed' escr t mconcat <$> sequence
[ applyEdits lno escr s| (lno,s)<-zip [firstLine..] $ linesR t
]
The sed'
function is used to build the include processor in the TestKit
utility modules used by the example scripts and programs. To filter lines to exclude the grepFilter
function is used.
-- | this function looks for lines of the form
--
-- `%include <file> [exclude <RE>]`
--
-- and replaces them with the contents of the named file, optionally
-- excluding any lines that match the given RE.
include :: LBS.ByteString -> IO LBS.ByteString
= sed' $ Select
include Function [re|^%include ${file}(@{%string})$|] TOP incl
[ Function [re|^%include ${file}(@{%string}) *exclude *${rex}(@{%string})$|] TOP incl
, Function [re|^.*$|] TOP nop
,
]where
= include' mtch
incl _ mtch _ _ = return Nothing
nop _ _ _ _
-- | processes the match from a '%include' line, analyses the match,
-- fetches the file, optionally excludes lines specified by an RE,
-- returning the text to include.
include' :: Match LBS.ByteString -> IO (Maybe LBS.ByteString)
= do
include' mtch <- case prs_s <$> mtch !$$? [cp|rex|] of
ftr Nothing -> return id
Just re_lbs -> excl <$> makeRegex re_lbs
Just . ftr <$> LBS.readFile (prs_s $ mtch !$$ [cp|file|])
where
excl :: RE -> LBS.ByteString -> LBS.ByteString
=
excl rex . map (matchesSource . getLineMatches)
LBS.unlines . filter (not . anyMatches . getLineMatches)
. grepFilter rex
= maybe (error "include'") T.unpack . parseString prs_s
The grepFilter
function takes an RE and a text and returns the result of matching the RE to every line in the file.
-- | returns a 'Line' for each line in the argument text, enumerating
-- all of the matches for that line
grepFilter :: IsRegex re s => re -> s -> [Line s]
= grepWithScript [(rex,mk)] . linesR
grepFilter rex where
= Just $ Line i mtchs mk i mtchs
-- | 'grepLines' returns a 'Line' for each line in the file, listing all
-- of the 'Matches' for that line
data Line s =
Line
getLineNumber :: LineNo -- ^ the 'LineNo' for this line
{ getLineMatches :: Matches s -- ^ all the 'Matches' of the RE on this line
,
}deriving (Show)
The sortImports
utility in the TestKit
utility module used by the scripts and example programs. It uses grep
to sort all of the imports by the name of the module in a single block located at the position of the first import statement in the module, where each import statement is in a standard form matched by the regex
[re|^import +(qualified| ) ${mod}([^ ].*)$|]
sortImports
under the name sortImports_
here.
sortImports_ :: LBS.ByteString -> LBS.ByteString
=
sortImports_ lbs $ map (matchesSource . getLineMatches) $
LBS.unlines ++ sortBy cMp bdy
hdr where
= case (extr ln1,extr ln2) of
cMp ln1 ln2 Nothing,Nothing) -> EQ
(Nothing,Just _ ) -> GT
(Just _ ,Nothing) -> LT
(Just x ,Just y) -> compare x y
(
= case allMatches $ getLineMatches ln of
extr ln :_ -> mtch !$$? [cp|mod|]
mtch-> Nothing
_
= span (not . anyMatches . getLineMatches) lns
(hdr,bdy) = grepFilter rex lbs
lns = [re|^import +(qualified )? *${mod}([^ ].*)$|] rex
> sortImports_ $ LBS.pack "-- preamble\nimport qualified Data.Text as T\nimport Data.List\n-- done\n"
ghci"-- preamble\nimport Data.List\nimport qualified Data.Text as T\n-- done\n"
The Lex toolkit can be used for quickly knocking together scanners that do not need to be efficient.
-- | a simple regex-based scanner interpretter for prototyping
-- scanners
alex :: IsRegex re s => [(re,Match s->Maybe t)] -> t -> s -> [t]
= alex' matchOnce
alex
-- | a higher order version of 'alex' parameterised over the @matchOnce@
-- function
alex' :: Replace s
=> (re->s->Match s)
-> [(re,Match s->Maybe t)]
-> t
-> s
-> [t]
= loop
alex' mo al t_err where
= case lengthR s == 0 of
loop s True -> []
False -> choose al s
= [t_err]
choose [] _ :al') s = case mb_p of
choose ((re,f)Just (s',t) -> t : loop s'
-> choose al' s
_ where
= do
mb_p <- matchCapture mtch
cap case captureOffset cap == 0 of
True -> (,) (captureSuffix cap) <$> f mtch
False -> Nothing
= mo re s mtch
It has been used in the library to scan REs so that the captures can be picked out, numbered and that number associated with a name where one has been given.
-- | scan a RE string into a list of RE Token
scan :: String -> [Token]
= alex' match al $ oops "top"
scan where
al :: [(Regex,Match String->Maybe Token)]
=
al "\\$\\{([^{}]+)\\}\\(" $ ECap . Just . x_1
[ mk "\\$\\(" $ const $ ECap Nothing
, mk "\\(\\?:" $ const PGrp
, mk "\\(\\?" $ const PCap
, mk "\\(" $ const Bra
, mk "\\\\(.)" $ BS . s2c . x_1
, mk "(.|\n)" $ Other . s2c . x_1
, mk
]
= captureText $ IsCaptureOrdinal $ CaptureOrdinal 1
x_1
= c
s2c [c] = oops "s2c"
s2c _
= (poss error id $ makeRegexM s,Just . f)
mk s f
= error $ "NamedCaptures.scan: " ++ m oops m
The findMatches_
function lists all of the files in a directort tree that match an RE.
-- | recursively list all files whose filename matches given RE,
-- sorting the list into ascending order; if the argument path has a
-- trailing '/' then it will be removed
findMatches_ :: IsRegex re s => FindMethods s -> re -> s -> IO [s]
= findMatches_' fm L.sort matched
findMatches_ fm
-- | recursively list all files whose filename matches given RE,
-- using the given function to determine which matches to accept
findMatches_' :: IsRegex re s
=> FindMethods s -- ^ the directory and filepath methods
-> ([s]->[s]) -- ^ result post-processing function
-> (Match s->Bool) -- ^ filtering function
-> re -- ^ re to be matched against the leaf filename
-> s -- ^ root directory of the search
-> IO [s]
= srt <$> find_ fm tst re (packR "") fp
findMatches_' fm srt tst re fp
find_ :: IsRegex re s
=> FindMethods s
-> (Match s->Bool)
-> re
-> s
-> s
-> IO [s]
@FindMethods{..} tst re fn fp = do
find_ fm<- doesDirectoryExistDM fp
is_dir case is_dir of
True -> do
<- filter ordinary <$> listDirectoryDM fp
fns concat <$>
mapM (uncurry $ find_ fm tst re) [ (fn_,abs_path fn_) | fn_<-fns ]
False -> return [ fp | lengthR fp /= 0 && tst (matchOnce re fn) ]
where
= fp `combineDM` fn_
abs_path fn_ = not $ fn_ `elem` [packR ".",packR ".."] ordinary fn_
It is used by the re-sort-imports
program to discover all of the Haskell scripts in the regex source tree and sort their import statements into a standard order (ultimately using the above-mentioned sortImport
function).
sort_r :: Mode -> FilePath -> IO ()
= findMatches_ fm [re|\.l?hs|] root >>= sort_these md root
sort_r md root where
= FindMethods
fm = doesDirectoryExist
{ doesDirectoryExistDM = getDirectoryContents
, listDirectoryDM = (</>)
, combineDM }