Great job. You’re a genius. Making the regex multiline and commented is a real blessing.
I’ve added several whitespace matches (\s) where I had missed them.
I’ve added bounded type parameters, e.g. <T extends A & B>.
I’ve added multiple inheritance: an interface can extend multiple interfaces.
I’ve generalized the DECLARATOR and GENERIC groups a bit.
The result is far from perfect, but it completely satisfies my needs. I don’t mind that it also recognizes multiple inheritance where Java does not allow it; we can leave some work for the compiler too. I hope this class-function template will help with other languages.
<!--
    Parser definition for Java.
    classRange/mainExpr matches the header of a class, enum, interface or
    annotation type up to its opening brace; the nested function/mainExpr
    matches method declarations. Both patterns run in free-spacing mode, so
    the line breaks inside the attribute values must be preserved: every
    inline regex comment ends at the end of its line.
    The less-than, greater-than and ampersand characters are written as
    \x3C, \x3E and \x26 so that the attribute values stay well-formed XML.
-->
<parser
    displayName="Java"
    id         ="java"
>
    <!--
        Type declaration header. openSymbole/closeSymbole are brace patterns;
        presumably they delimit the type body (confirm against the consumer
        of this file).
    -->
    <classRange
        mainExpr    ="(?x)                                  # free-spacing mode with inline comments (see `RegEx - Pattern Modifiers`)
            ^[\t\x20]*                                      # leading whitespace
            (?:                                             # any number of modifiers and annotations
                (?-i:
                    abstract
                |   final
                |   native
                |   p(?:rivate|rotected|ublic)
                |   s(?:tatic|trictfp|ynchronized)
                |   transient
                |   volatile
                |   @[A-Za-z_]\w*                           # qualified identifier
                    (?:
                        \.
                        [A-Za-z_]\w*
                    )*
                )
                \s+
            )*
            (?-i:class|enum|@?interface)
            \s+
            (?'DECLARATOR'
                (?'VALID_ID'                                # valid identifier, use as subroutine
                    \b(?!(?-i:
                        a(?:bstract|ssert)
                    |   b(?:oolean|reak|yte)
                    |   c(?:ase|atch|har|lass|on(?:st|tinue))
                    |   d(?:efault|o(?:uble)?)
                    |   e(?:lse|num|xtends)
                    |   f(?:inal(?:ly)?|loat|or)
                    |   goto
                    |   i(?:f|mp(?:lements|ort)|nstanceof|nt(?:erface)?)
                    |   long
                    |   n(?:ative|ew)
                    |   p(?:ackage|rivate|rotected|ublic)
                    |   return
                    |   s(?:hort|tatic|trictfp|uper|witch|ynchronized)
                    |   th(?:is|rows?)
                    |   tr(?:ansient|y)
                    |   vo(?:id|latile)
                    |   while
                    )\b)                                    # keywords, not to be used as identifier
                    [A-Za-z_]\w*                            # valid character combination for identifiers
                )
                (?:
                    \s*\x3C                                 # start-of-template indicator
                    (?'GENERIC'                             # ...match first generic, use as subroutine
                        \s*
                        (?:
                            (?&DECLARATOR)                  # use named generic
                        |   \?                              # or unknown
                        )
                        (?:                                 # optional type extension
                            \s+(?-i:extends|super)
                            \s+(?&DECLARATOR)
                            (?:                             # multiple bounds
                                \s*\x26                     # ...are ampersand separated, surrounding whitespace is optional
                                \s*(?&DECLARATOR)
                            )*
                        )?
                        (?:                                 # match consecutive generics objects
                            \s*,                            # ...comma separated
                            (?&GENERIC)
                        )?
                    )
                    \s*\x3E                                 # ...end-of-template indicator
                )?
                (?:                                         # package and|or nested classes
                    \.                                      # ...are dot separated
                    (?&DECLARATOR)
                )?
            )
            (?:                                             # optional object extension
                \s+(?-i:extends)
                \s+(?&DECLARATOR)
                (?:                                         # ...match consecutive objects, they are
                    \s*,                                    #    separated by a comma
                    \s*(?&DECLARATOR)
                )*
            )?
            (?:                                             # optional object implementation
                \s+(?-i:implements)
                \s+(?&DECLARATOR)                           # ...match first object
                (?:                                         # ...match consecutive objects, they are
                    \s*,                                    #    separated by a comma
                    \s*(?&DECLARATOR)
                )*
            )?
            \s*\{                                           # whatever, up till start-of-body indicator
        "
        openSymbole ="\{"
        closeSymbole="\}"
    >
        <!-- Extracts the type name, plus an optional generic parameter list, that follows the class/enum/interface keyword. -->
        <className>
            <nameExpr expr="(?-i:class|enum|@?interface)\s+\K\w+(?:\s*\x3C.*?\x3E)?" />
        </className>
        <!--
            Method declaration: modifiers and annotations, optional generic
            parameter list, return type, method name, argument list, optional
            throws clause, then either an opening brace or a semicolon.
        -->
        <function
            mainExpr="(?x)                                  # free-spacing mode with inline comments (see `RegEx - Pattern Modifiers`)
                ^[\t\x20]*                                  # leading whitespace
                (?:                                         # any number of modifiers and annotations
                    (?-i:
                        abstract
                    |   default                             # interface default method
                    |   final
                    |   native
                    |   p(?:rivate|rotected|ublic)
                    |   s(?:tatic|trictfp|ynchronized)
                    |   transient
                    |   volatile
                    |   @[A-Za-z_]\w*                       # qualified identifier
                        (?:
                            \.                              # ... dot separated
                            [A-Za-z_]\w*
                        )*
                    )
                    \s+
                )*
                (?:
                    \s*\x3C                                 # start-of-template indicator
                    (?&GENERIC)
                    \s*\x3E                                 # end-of-template indicator
                )?
                \s*
                (?'DECLARATOR'
                    (?!(?-i:assert|do|else|new|return|throw)\b) # statement keywords never start a type, e.g. reject `return foo();`
                    [A-Za-z_]\w*                            # type name
                    (?:                                     # optional parent type name(s)
                        \.                                  # ...parent-sibling separator
                        [A-Za-z_]\w*                        # ...parent type name
                    )*
                    (?:
                        \s*\x3C                             # start-of-template indicator
                        (?'GENERIC'                         # ...match first generic, use as subroutine
                            \s*
                            (?:
                                (?&DECLARATOR)              # use named generic
                            |   \?                          # or unknown
                            )
                            (?:                             # optional type extension
                                \s+(?-i:extends|super)
                                \s+(?&DECLARATOR)
                                (?:                         # multiple bounds
                                    \s*\x26                 # ...are ampersand separated, surrounding whitespace is optional
                                    \s*(?&DECLARATOR)
                                )*
                            )?
                            (?:                             # match consecutive generics objects
                                \s*,                        # ...comma separated
                                (?&GENERIC)
                            )?
                        )
                        \s*\x3E                             # ...end-of-template indicator
                    )?
                    (?:                                     # package and|or nested classes
                        \.                                  # ... are dot separated
                        (?&DECLARATOR)
                    )?
                    (?:                                     # optional compound type
                        \s*\[                               # ...start-of-compound indicator
                        \s*\]                               # ...end-of-compound indicator
                    )*
                )
                \s+
                (?'VALID_ID'                                # valid identifier, use as subroutine
                    \b(?!(?-i:
                        a(?:bstract|ssert)
                    |   b(?:oolean|reak|yte)
                    |   c(?:ase|atch|har|lass|on(?:st|tinue))
                    |   d(?:efault|o(?:uble)?)
                    |   e(?:lse|num|xtends)
                    |   f(?:inal(?:ly)?|loat|or)
                    |   goto
                    |   i(?:f|mp(?:lements|ort)|nstanceof|nt(?:erface)?)
                    |   long
                    |   n(?:ative|ew)
                    |   p(?:ackage|rivate|rotected|ublic)
                    |   return
                    |   s(?:hort|tatic|trictfp|uper|witch|ynchronized)
                    |   th(?:is|rows?)
                    |   tr(?:ansient|y)
                    |   vo(?:id|latile)
                    |   while
                    )\b)                                    # keywords, not to be used as identifier
                    [A-Za-z_]\w*                            # valid character combination for identifiers
                )
                \s*\(                                       # start-of-arguments indicator
                (?'ARG'                                     # ...match first argument, use as subroutine
                    \s*(?-i:final\s+)?
                    (?&DECLARATOR)
                    (?:
                        \s*\.{3}\s*                         # varargs marker, e.g. `String... args`
                    |   \s+                                 # or plain whitespace between type and name
                    )
                    (?&VALID_ID)                            # argument name
                    (?:                                     # ...consecutive arguments are
                        \s*,                                #    separated by commas
                        (?&ARG)
                    )?
                )?
                \s*\)                                       # end-of-arguments indicator, preceding whitespace allowed
                (?:                                         # optional exceptions
                    \s*(?-i:throws)
                    \s+(?&VALID_ID)                         # ...first exception name
                    (?:\.(?&VALID_ID))*                     #    optionally package qualified
                    (?:                                     # ...consecutive exception names are
                        \s*,                                #    separated by commas
                        \s*(?&VALID_ID)
                        (?:\.(?&VALID_ID))*
                    )*
                )?
                \s*(?:                                      # function declaration ends with ...
                    \{                                      # ...a start-of-function-body indicator or
                |   ;                                       # ...an end-of-declaration indicator
                )
            "
        >
            <!-- The method name is the word directly in front of the opening parenthesis. -->
            <functionName>
                <funcNameExpr expr="\w+(?=\s*\()" />
            </functionName>
        </function>
    </classRange>
</parser>