package org.gbif.nameparser;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.spi.LocationInfo;
import org.gbif.api.exception.UnparsableException;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.service.checklistbank.NameParser;
import org.gbif.api.vocabulary.NameType;
import org.gbif.api.vocabulary.NomenclaturalCode;
import org.gbif.api.vocabulary.Rank;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/name-parser-2.21.jar:org/gbif/nameparser/GBIFNameParser.class */
public class GBIFNameParser implements NameParser {
    /** Parser for the already cleaned and normalised name string; assigned once in the constructors. */
    private final NormalisedNameParser nnParser;

    // All Pattern constants below are blank finals that are compiled in the static
    // initializer at the end of this class.

    // --- hybrid and extinct markers ---
    public static final String HYBRID_MARKER = "×";
    public static final Pattern HYBRID_FORMULA_PATTERN;
    public static final String EXTINCT_MARKER = "†";
    private static final Pattern EXTINCT_PATTERN;

    // --- cultivars, strains, viruses, genes, Candidatus names ---
    @VisibleForTesting
    protected static final Pattern CULTIVAR;
    private static final Pattern CULTIVAR_GROUP;
    private static final Pattern STRAIN;
    public static final Pattern IS_VIRUS_PATTERN;
    public static final Pattern IS_VIRUS_PATTERN_CASE_SENSITIVE;
    private static final Pattern IS_VIRUS_PATTERN_POSTFAIL;
    public static final Pattern IS_GENE;
    private static final String CANDIDATUS = "(Candidatus\\s|Ca\\.)\\s*";
    private static final Pattern IS_CANDIDATUS_PATTERN;
    private static final Pattern IS_CANDIDATUS_QUOTE_PATTERN;

    // --- trailing rank markers, sensu, nomenclatural status, remarks, years ---
    private static final Pattern RANK_MARKER_AT_END;
    private static final String SENSU = "(s\\.(?:l\\.|str\\.)|sensu\\s+(?:latu|strictu?)|(sec|sensu|auct|non)((\\.|\\s)(.*))?)";
    private static final Pattern EXTRACT_SENSU;
    private static final String NOV_RANKS = "fam|gen|sp|ssp|var|forma";
    private static final Pattern NOV_RANK_MARKER;
    protected static final Pattern EXTRACT_NOMSTATUS;
    private static final Pattern EXTRACT_REMARKS;
    private static final Pattern EXTRACT_YEAR;
    private static final Pattern COMMA_BEFORE_YEAR;

    // --- normalisation patterns used by normalize() and normalizeStrong() ---
    private static final Pattern REPLACE_QUOTES;
    private static final Pattern NORM_QUOTES;
    private static final Pattern NORM_UPPERCASE_WORDS;
    private static final Pattern NORM_WHITESPACE;
    private static final Pattern NORM_NO_SQUARE_BRACKETS;
    private static final Pattern NORM_BRACKETS_OPEN;
    private static final Pattern NORM_BRACKETS_CLOSE;
    private static final Pattern NORM_BRACKETS_OPEN_STRONG;
    private static final Pattern NORM_BRACKETS_CLOSE_STRONG;
    private static final Pattern NORM_AND;
    private static final Pattern NORM_ET_AL;
    private static final Pattern NORM_AMPERSAND_WS;
    private static final Pattern NORM_HYPHENS;
    private static final Pattern NORM_SUBGENUS;
    private static final Pattern NO_Q_MARKS;
    private static final Pattern NORM_COMMAS;
    private static final Pattern NORM_ORIG_AUTH;
    private static final Pattern NORM_ORIG_AUTH2;
    private static final Pattern NORM_IMPRINT_YEAR;
    private static final Pattern NORM_HYBRIDS_GENUS;
    private static final Pattern NORM_HYBRIDS_EPITH;
    private static final Pattern NORM_HYBRIDS_FORM;
    private static final Pattern NORM_INDET;
    private static final Pattern NORM_DOTS;
    private static final Pattern NORM_TF_GENUS;
    private static final Pattern NORM_IN_COMMA;
    private static final Pattern NORM_IN_BIB;
    private static final Pattern NORM_PREFIXES;
    private static final Pattern NORM_SUFFIXES;
    private static final Pattern NO_LETTERS;

    // --- placeholder and doubtful name detection ---
    private static final String PLACEHOLDER_AUTHOR = "(?:unknown|unspecified|uncertain|\\?)";
    private static final Pattern REMOVE_PLACEHOLDER_AUTHOR;
    private static final String PLACEHOLDER_NAME = "(?:unnamed|mixed|unassigned|unallocated|unplaced|undetermined|unclassified|uncultured|unknown|unspecified|uncertain|incertae sedis|not assigned|awaiting allocation|temp|dummy)";
    private static final Pattern REMOVE_PLACEHOLDER_INFRAGENERIC;
    private static final Pattern PLACEHOLDER;
    private static final Pattern DOUBTFUL;
    private static final Pattern DOUBTFUL2;
    private static final Pattern BAD_NAME_SUFFICES;

    // --- markup clean-up used by preClean() and cleanStrong() ---
    private static final Pattern XML_ENTITY_STRIP;
    private static final Pattern AMPERSAND_ENTITY;
    private static final Pattern XML_TAGS;
    private static final Pattern FIRST_WORD;
    private static final String WEIRD_CHARS = "[§$%/#+!;:_|\"=*]";
    private static final Pattern NORM_WEIRD_CHARS;
    private static final Pattern FORM_SPECIALIS;
    private static final Pattern SENSU_LATU;
    private static final Pattern TYPE_TO_VAR;
    private static final Pattern COMB_BAS_AUTHOR_SWAP;

    /** Class logger; final so it cannot be reassigned at runtime. */
    private static final Logger LOG = LoggerFactory.getLogger(GBIFNameParser.class);

    /** Quote characters stripped from both ends of a name by preClean(); the four slots are filled in the static initializer. */
    private static final char[] QUOTES = new char[4];

    public GBIFNameParser() {
        this.nnParser = new NormalisedNameParser(500L);
    }

    public GBIFNameParser(long j) {
        this.nnParser = new NormalisedNameParser(j / 2);
    }

    /**
     * Parses a scientific name string into a {@link ParsedName}.
     *
     * <p>The input is pre-cleaned, special name kinds (Candidatus, strains, placeholders, viruses,
     * genes, cultivars, hybrid formulas) are detected, nomenclatural status, sensu and remarks are
     * extracted, and the remaining string is normalised and handed to the
     * {@link NormalisedNameParser}. If that fails, a more aggressively cleaned string is tried, and
     * finally a parse that ignores the authorship.
     *
     * @param str  the raw scientific name
     * @param rank optional rank passed through to the normalised name parser
     * @return the parsed name; its scientific name is always the unmodified input
     * @throws UnparsableException with type NO_NAME for null/empty or letter-free input, PLACEHOLDER,
     *                             VIRUS or HYBRID for those name kinds, and DOUBTFUL when no parsing
     *                             strategy succeeds
     */
    @Override // org.gbif.api.service.checklistbank.NameParser
    public ParsedName parse(String str, @Nullable Rank rank) throws UnparsableException {
        if (Strings.isNullOrEmpty(str)) {
            throw new UnparsableException(NameType.NO_NAME, str);
        }
        // Only take timestamps when the timing message will actually be logged.
        final boolean timed = LOG.isDebugEnabled();
        final long start = timed ? System.currentTimeMillis() : 0L;

        ParsedName parsedName = new ParsedName();
        parsedName.setScientificName(str);

        String name = EXTINCT_PATTERN.matcher(preClean(str)).replaceFirst("");

        // Quoted Candidatus names are matched on the raw input because preClean() strips outer quotes.
        Matcher quotedCandidatus = IS_CANDIDATUS_QUOTE_PATTERN.matcher(str);
        if (quotedCandidatus.find()) {
            parsedName.setType(NameType.CANDIDATUS);
            // The captured text is arbitrary user input: quote it so '$' and '\' are taken literally.
            name = quotedCandidatus.replaceFirst(Matcher.quoteReplacement(quotedCandidatus.group(2)));
        }

        name = TYPE_TO_VAR.matcher(name).replaceAll("$1var");

        Matcher strain = STRAIN.matcher(name);
        if (strain.find()) {
            final String designation = strain.group(2);
            name = strain.replaceFirst(Matcher.quoteReplacement(strain.group(1)));
            parsedName.setType(NameType.INFORMAL);
            parsedName.setStrain(designation);
            LOG.debug("Strain: {}", designation);
        }

        Matcher placeholderAuthor = REMOVE_PLACEHOLDER_AUTHOR.matcher(name);
        if (placeholderAuthor.find()) {
            name = placeholderAuthor.replaceFirst(" $1");
            parsedName.setType(NameType.PLACEHOLDER);
        }
        Matcher placeholderInfrageneric = REMOVE_PLACEHOLDER_INFRAGENERIC.matcher(name);
        if (placeholderInfrageneric.find()) {
            name = placeholderInfrageneric.replaceFirst("");
            parsedName.setType(NameType.PLACEHOLDER);
        }
        if (PLACEHOLDER.matcher(name).find()) {
            throw new UnparsableException(NameType.PLACEHOLDER, str);
        }
        if (IS_VIRUS_PATTERN.matcher(name).find() || IS_VIRUS_PATTERN_CASE_SENSITIVE.matcher(name).find()) {
            throw new UnparsableException(NameType.VIRUS, str);
        }
        if (IS_GENE.matcher(name).find()) {
            parsedName.setType(NameType.INFORMAL);
        }

        String normalized = normalize(name);
        if (Strings.isNullOrEmpty(normalized)) {
            throw new UnparsableException(NameType.NO_NAME, str);
        }

        Matcher cultivarGroup = CULTIVAR_GROUP.matcher(normalized);
        if (cultivarGroup.find()) {
            final String groupKind = cultivarGroup.group(2);
            parsedName.setCultivarEpithet(cultivarGroup.group(1));
            normalized = cultivarGroup.replaceFirst(" ");
            parsedName.setType(NameType.CULTIVAR);
            parsedName.setRank(groupKind.equalsIgnoreCase("grex") ? Rank.GREX : Rank.CULTIVAR_GROUP);
        }
        Matcher cultivar = CULTIVAR.matcher(normalized);
        if (cultivar.find()) {
            parsedName.setCultivarEpithet(cultivar.group(1));
            normalized = cultivar.replaceFirst(" ");
            parsedName.setType(NameType.CULTIVAR);
            parsedName.setRank(Rank.CULTIVAR);
        }

        if (NO_LETTERS.matcher(normalized).find()) {
            throw new UnparsableException(NameType.NO_NAME, str);
        }
        if (HYBRID_FORMULA_PATTERN.matcher(normalized).find()) {
            throw new UnparsableException(NameType.HYBRID, str);
        }

        Matcher candidatus = IS_CANDIDATUS_PATTERN.matcher(normalized);
        if (candidatus.find()) {
            parsedName.setType(NameType.CANDIDATUS);
            normalized = candidatus.replaceFirst("");
        }

        Matcher nomStatus = EXTRACT_NOMSTATUS.matcher(normalized);
        if (nomStatus.find()) {
            parsedName.setNomStatus(StringUtils.trimToNull(nomStatus.group(1)));
            normalized = nomStatus.replaceFirst("");
            if (parsedName.getNomStatus() != null) {
                // A status such as "sp. nov." also tells us the rank.
                Matcher novRank = NOV_RANK_MARKER.matcher(parsedName.getNomStatus());
                if (novRank.find()) {
                    NormalisedNameParser.setRank(parsedName, novRank.group(1));
                }
            }
        }

        Matcher sensu = EXTRACT_SENSU.matcher(normalized);
        if (sensu.find()) {
            parsedName.setSensu(StringUtils.trimToNull(sensu.group(1)));
            normalized = sensu.replaceFirst("");
        }

        Matcher remarks = EXTRACT_REMARKS.matcher(normalized);
        if (remarks.find()) {
            parsedName.setRemarks(StringUtils.trimToNull(remarks.group(1)));
            normalized = remarks.replaceFirst("");
        }

        if (parsedName.getType() != NameType.CULTIVAR) {
            Matcher rankAtEnd = RANK_MARKER_AT_END.matcher(normalized);
            // A trailing " f." / " f" is left untouched here rather than treated as a rank marker.
            if (rankAtEnd.find() && !normalized.endsWith(" f.") && !normalized.endsWith(" f")) {
                parsedName.setType(NameType.INFORMAL);
                NormalisedNameParser.setRank(parsedName, rankAtEnd.group(2));
                normalized = rankAtEnd.replaceAll("");
            }
            Matcher indet = NORM_INDET.matcher(normalized);
            if (indet.find()) {
                parsedName.setType(NameType.INFORMAL);
                normalized = indet.replaceAll(" ");
            }
        }

        String normalizedStrong = normalizeStrong(normalized);
        if (Strings.isNullOrEmpty(normalizedStrong)) {
            throw new UnparsableException(NameType.DOUBTFUL, str);
        }

        // Remember a rank detected above so that it can be re-applied after the parser has run.
        final Rank detectedRank = parsedName.getRank();
        if (!this.nnParser.parseNormalisedName(parsedName, normalizedStrong, rank)) {
            LOG.debug("Can't parse, use dirty normalizer");
            String cleaned = cleanStrong(normalizedStrong);
            if (!this.nnParser.parseNormalisedName(parsedName, cleaned, rank)) {
                LOG.debug("Still can't parse, try to ignore authors");
                boolean parsedWithoutAuthors = this.nnParser.parseNormalisedNameIgnoreAuthors(parsedName, cleaned, rank);
                parsedName.setAuthorsParsed(false);
                if (!parsedWithoutAuthors) {
                    if (IS_VIRUS_PATTERN_POSTFAIL.matcher(normalizedStrong).find()) {
                        throw new UnparsableException(NameType.VIRUS, str);
                    }
                    throw new UnparsableException(NameType.DOUBTFUL, str);
                }
            }
        }
        if (detectedRank != null) {
            parsedName.setRank(detectedRank);
        }

        postAssertParsing(parsedName, str, normalizedStrong);
        determineNameType(parsedName, str);
        if (parsedName.getRank() == null) {
            parsedName.setRank(RankUtils.inferRank(parsedName));
        }
        if (timed) {
            LOG.debug("Parsing time: {}", Long.valueOf(System.currentTimeMillis() - start));
        }
        return parsedName;
    }

    /**
     * Parses a scientific name without supplying a rank.
     *
     * @param str the raw scientific name
     * @return the result of {@link #parse(String, Rank)} called with a null rank
     * @throws UnparsableException if the name cannot be parsed
     */
    @Override // org.gbif.api.service.checklistbank.NameParser
    public ParsedName parse(String str) throws UnparsableException {
        return parse(str, (Rank) null);
    }

    /**
     * Parses a name without ever throwing {@link UnparsableException}.
     *
     * @param str  the raw scientific name
     * @param rank optional rank passed to {@link #parse(String, Rank)}
     * @return the parsed name, or, when parsing fails, a new instance carrying only the given
     *         scientific name and rank, the name type from the exception, and both the parsed and
     *         authorsParsed flags set to false
     */
    @Override // org.gbif.api.service.checklistbank.NameParser
    public ParsedName parseQuietly(String str, @Nullable Rank rank) {
        try {
            return parse(str, rank);
        } catch (UnparsableException e) {
            ParsedName unparsed = new ParsedName();
            unparsed.setScientificName(str);
            unparsed.setRank(rank);
            unparsed.setType(e.type);
            unparsed.setParsed(false);
            unparsed.setAuthorsParsed(false);
            return unparsed;
        }
    }

    /**
     * Parses a name quietly without supplying a rank.
     *
     * @param str the raw scientific name
     * @return the result of {@link #parseQuietly(String, Rank)} called with a null rank
     */
    @Override // org.gbif.api.service.checklistbank.NameParser
    public ParsedName parseQuietly(String str) {
        return parseQuietly(str, (Rank) null);
    }

    /**
     * Parses a name and returns its canonical form.
     *
     * @param str  the raw scientific name
     * @param rank optional rank passed to {@link #parse(String, Rank)}
     * @return the canonical name, or null when the input is null/empty or cannot be parsed
     *         (the failure is logged at WARN level)
     */
    @Override // org.gbif.api.service.checklistbank.NameParser
    public String parseToCanonical(String str, @Nullable Rank rank) {
        if (Strings.isNullOrEmpty(str)) {
            return null;
        }
        try {
            ParsedName parsed = parse(str, rank);
            // parse() in this class never returns null; the check guards overriding subclasses.
            return parsed == null ? null : parsed.canonicalName();
        } catch (UnparsableException e) {
            // Parameterized logging: no string building when WARN is disabled.
            LOG.warn("Unparsable name {} >>> {}", str, e.getMessage());
            return null;
        }
    }

    /**
     * Parses a name to its canonical form without supplying a rank.
     *
     * @param str the raw scientific name
     * @return the result of {@link #parseToCanonical(String, Rank)} called with a null rank
     */
    @Override // org.gbif.api.service.checklistbank.NameParser
    public String parseToCanonical(String str) {
        return parseToCanonical(str, (Rank) null);
    }

    /**
     * Parses a name and returns its canonical form, falling back to the whitespace-normalised
     * input when the name cannot be parsed.
     *
     * @param str  the raw scientific name
     * @param rank optional rank passed to {@link #parse(String, Rank)}
     * @return null for null/empty input; otherwise the canonical name, or the trimmed input with
     *         whitespace normalised when parsing fails (the failure is logged at WARN level)
     */
    public String parseToCanonicalOrScientificName(String str, @Nullable Rank rank) {
        if (Strings.isNullOrEmpty(str)) {
            return null;
        }
        try {
            ParsedName parsed = parse(str, rank);
            // parse() in this class never returns null; the check guards overriding subclasses.
            if (parsed != null) {
                return parsed.canonicalName();
            }
        } catch (UnparsableException e) {
            // Parameterized logging: no string building when WARN is disabled.
            LOG.warn("Unparsable name {} >>> {}", str, e.getMessage());
        }
        return StringUtils.normalizeSpace(str.trim());
    }

    /**
     * Aggressive clean-up applied when the normalised name could not be parsed: drops known bad
     * trailing words, blanks out unusual punctuation, re-cases the first word to "Xxxx" and
     * normalises a leading hybrid marker.
     *
     * @param str the normalised name, may be null
     * @return the cleaned name, or null if the input was null
     */
    protected static String cleanStrong(String str) {
        if (str == null) {
            return null;
        }
        String name = BAD_NAME_SUFFICES.matcher(str).replaceAll("");
        name = NORM_WEIRD_CHARS.matcher(name).replaceAll(" ");

        Matcher firstWord = FIRST_WORD.matcher(name);
        // Group 2 is a hybrid marker fused to a capital letter; such words are left as they are.
        if (firstWord.find() && firstWord.group(2) == null) {
            // Locale.ROOT keeps the case mapping independent of the JVM default locale
            // (e.g. Turkish would turn "i" into a dotted capital I).
            String recased = StringUtils.defaultString(firstWord.group(1))
                    + firstWord.group(3).toUpperCase(Locale.ROOT)
                    + firstWord.group(4).toLowerCase(Locale.ROOT)
                    + " ";
            name = firstWord.replaceFirst(recased);
        }
        return NORM_HYBRIDS_GENUS.matcher(name).replaceFirst("×$1");
    }

    /**
     * Light normalisation of a name string: unescapes unicode escapes, unifies a few abbreviations,
     * fixes spacing around dots, commas, hyphens, ampersands and brackets, normalises hybrid
     * markers, converts all-uppercase words to capitalised words and collapses whitespace.
     *
     * @param str the name to normalise, may be null
     * @return the normalised, trimmed name (never null for non-null input), or null for null input
     */
    public static String normalize(String str) {
        if (str == null) {
            return null;
        }
        String name = org.gbif.utils.text.StringUtils.unescapeUnicodeChars(str);
        name = FORM_SPECIALIS.matcher(name).replaceAll("fsp");
        name = SENSU_LATU.matcher(name).replaceAll("sl");
        name = NORM_DOTS.matcher(name).replaceAll("$1. ");
        name = COMMA_BEFORE_YEAR.matcher(name).replaceAll("$1, $2");
        name = NORM_HYPHENS.matcher(name).replaceAll("-");
        name = NORM_AMPERSAND_WS.matcher(name).replaceAll(" & ");
        name = NORM_BRACKETS_OPEN.matcher(name).replaceAll(" $1");
        name = NORM_BRACKETS_CLOSE.matcher(name).replaceAll("$1 ");
        name = NORM_COMMAS.matcher(name).replaceAll(", ");
        name = NORM_HYBRIDS_GENUS.matcher(name).replaceFirst("×$1");
        name = NORM_HYBRIDS_EPITH.matcher(name).replaceFirst("$1 ×$2");
        name = NORM_HYBRIDS_FORM.matcher(name).replaceAll(" × ");

        // Capitalise all-uppercase words in place. Rewriting at the match position (instead of
        // searching the string for the matched text) guarantees that only the matched word is
        // changed, never an identical letter run inside another token.
        Matcher upperWord = NORM_UPPERCASE_WORDS.matcher(name);
        if (upperWord.find()) {
            StringBuffer recased = new StringBuffer(name.length());
            do {
                // Locale.ROOT keeps the result independent of the JVM default locale.
                upperWord.appendReplacement(recased, upperWord.group(1) + upperWord.group(2).toLowerCase(Locale.ROOT));
            } while (upperWord.find());
            upperWord.appendTail(recased);
            name = recased.toString();
        }
        return StringUtils.trimToEmpty(NORM_WHITESPACE.matcher(name).replaceAll(" "));
    }

    /**
     * Stronger normalisation applied before the actual parsing: unifies and strips quotes, removes
     * question marks after letters, drops rank prefixes and junk suffixes, normalises imprint
     * years, brackets, "and"/"et al." spellings and subgenus notation, then runs
     * {@link #normalize(String)} on the result.
     *
     * @param str the name to normalise, may be null
     * @return the normalised, trimmed name (never null for non-null input), or null for null input
     */
    @VisibleForTesting
    static String normalizeStrong(String str) {
        if (str == null) {
            return null;
        }
        String name = NORM_QUOTES.matcher(str).replaceAll("'");
        name = REPLACE_QUOTES.matcher(name).replaceAll("");
        name = NO_Q_MARKS.matcher(name).replaceAll("$1");
        name = NORM_PREFIXES.matcher(name).replaceAll("");
        name = NORM_TF_GENUS.matcher(name).replaceAll("$1$2 ");
        name = NORM_IMPRINT_YEAR.matcher(name).replaceAll("$1");
        name = NORM_IN_COMMA.matcher(name).replaceFirst(" in ");

        // Capitalise all-uppercase words in place. Rewriting at the match position (instead of
        // searching the string for the matched text) guarantees that only the matched word is
        // changed, never an identical letter run inside another token.
        Matcher upperWord = NORM_UPPERCASE_WORDS.matcher(name);
        if (upperWord.find()) {
            StringBuffer recased = new StringBuffer(name.length());
            do {
                // Locale.ROOT keeps the result independent of the JVM default locale.
                upperWord.appendReplacement(recased, upperWord.group(1) + upperWord.group(2).toLowerCase(Locale.ROOT));
            } while (upperWord.find());
            upperWord.appendTail(recased);
            name = recased.toString();
        }

        // The original-authorship rewrites are only attempted on names shorter than 80 characters
        // that contain something looking like a year.
        if (EXTRACT_YEAR.matcher(name).find() && name.length() < 80) {
            name = NORM_ORIG_AUTH.matcher(name).replaceAll("($1 $2)");
            name = NORM_ORIG_AUTH2.matcher(name).replaceAll("($1 $2)");
        }

        name = NORM_NO_SQUARE_BRACKETS.matcher(name).replaceAll(" $1 ");
        name = NORM_BRACKETS_OPEN_STRONG.matcher(name).replaceAll(" (");
        name = NORM_BRACKETS_CLOSE_STRONG.matcher(name).replaceAll(") ");
        name = NORM_AND.matcher(name).replaceAll(" & ");
        name = NORM_ET_AL.matcher(name).replaceAll(" et al.");
        name = NORM_SUFFIXES.matcher(name).replaceAll("");
        name = NORM_SUBGENUS.matcher(name).replaceAll("$1 ($2) $3");
        return StringUtils.trimToEmpty(normalize(name));
    }

    /**
     * First clean-up pass on the raw input: unescapes unicode and HTML entities, removes simple
     * XML tags, strips leading and trailing quote characters and collapses whitespace.
     *
     * @param str the raw name
     * @return the cleaned, trimmed name
     */
    @VisibleForTesting
    static String preClean(String str) {
        String cleaned = org.gbif.utils.text.StringUtils.unescapeUnicodeChars(str);
        // Tighten entities written with inner blanks ("& amp ;") so they can be unescaped.
        cleaned = XML_ENTITY_STRIP.matcher(cleaned).replaceAll("&$1;");
        cleaned = StringEscapeUtils.unescapeHtml4(cleaned);
        cleaned = AMPERSAND_ENTITY.matcher(cleaned).replaceAll("& ");
        cleaned = XML_TAGS.matcher(cleaned).replaceAll("").trim();

        for (int q = 0; q < QUOTES.length; q++) {
            final char quote = QUOTES[q];
            final int length = cleaned.length();

            // Count leading characters that are this quote or whitespace.
            int lead = 0;
            while (lead < length && (cleaned.charAt(lead) == quote || Character.isWhitespace(cleaned.charAt(lead)))) {
                lead++;
            }
            if (lead == 0) {
                continue;
            }
            // Only when something was stripped in front: count trailing occurrences of this quote,
            // never reaching back into the part already removed from the front.
            int trail = 0;
            while (cleaned.charAt(length - 1 - trail) == quote && length - lead - trail > 0) {
                trail++;
            }
            cleaned = cleaned.substring(lead, length - trail);
        }
        return StringUtils.trimToEmpty(NORM_WHITESPACE.matcher(cleaned).replaceAll(" "));
    }

    /**
     * Assigns a name type to a parsed name that does not have one yet; names that already carry
     * a type are left untouched.
     *
     * @param parsedName the parsed name to classify
     * @param str        the original, unmodified name string
     */
    private void determineNameType(ParsedName parsedName, String str) {
        if (parsedName.getType() != null) {
            return;
        }
        // Null-safe comparison: a missing genus must not cause a NullPointerException here.
        // NOTE(review): LocationInfo.NA is a log4j constant ("?" in log4j 1.2); presumably the
        // original source compared against the literal "?" - confirm and replace.
        if (LocationInfo.NA.equals(parsedName.getGenusOrAbove())) {
            parsedName.setType(NameType.PLACEHOLDER);
        } else if (!DOUBTFUL.matcher(str).find() || DOUBTFUL2.matcher(str).find()) {
            // Either the string contains characters outside the accepted set or the word "null".
            parsedName.setType(NameType.DOUBTFUL);
        } else {
            parsedName.setType(NameType.SCIENTIFIC);
        }
    }

    /**
     * Sanity checks run after a successful parse; may downgrade the name type to INFORMAL or
     * DOUBTFUL when the parsed rank does not fit the parsed name parts.
     *
     * @param parsedName the freshly parsed name
     * @param str        the original, unmodified name string (used for the exception only)
     * @param str2       the normalised string that was parsed; must not be empty
     * @throws UnparsableException with type DOUBTFUL when a non-binomial name with a genus starts
     *                             with a lowercase character
     */
    private void postAssertParsing(ParsedName parsedName, String str, String str2) throws UnparsableException {
        if (parsedName.getGenusOrAbove() != null && !parsedName.isBinomial() && Character.isLowerCase(str2.charAt(0))) {
            throw new UnparsableException(NameType.DOUBTFUL, str);
        }

        final Rank parsedRank = parsedName.getRank();
        if (parsedRank == null) {
            return;
        }

        // A cultivar rank without a cultivar epithet.
        if (parsedRank.equals(Rank.CULTIVAR) && parsedName.getCultivarEpithet() == null) {
            parsedName.setType(NameType.INFORMAL);
            return;
        }

        final boolean notCultivarCode = parsedRank.isRestrictedToCode() != NomenclaturalCode.CULTIVARS;
        if (parsedRank.isSpeciesOrBelow() && notCultivarCode && !parsedName.isBinomial()) {
            // Species level or lower rank, but no binomial was parsed.
            parsedName.setType(NameType.INFORMAL);
        } else if (parsedRank.isInfraspecific() && notCultivarCode && parsedName.getInfraSpecificEpithet() == null) {
            // Infraspecific rank, but no infraspecific epithet was parsed.
            parsedName.setType(NameType.INFORMAL);
        } else if (!parsedRank.isSpeciesOrBelow() && parsedName.isBinomial()) {
            // A binomial with a rank above species.
            parsedName.setType(NameType.DOUBTFUL);
        }
    }

    /**
     * @return the {@link NormalisedNameParser} instance this parser delegates to
     */
    public NormalisedNameParser getNormalisedNameParser() {
        return nnParser;
    }

    // Compiles every Pattern constant of this class once at class-load time.
    static {
        // Quote characters stripped from both ends of a name by preClean().
        QUOTES[0] = '\"';
        QUOTES[1] = '\'';
        QUOTES[2] = '\"';
        QUOTES[3] = '\'';

        // --- hybrid and extinct markers ---
        HYBRID_FORMULA_PATTERN = Pattern.compile(" × ");
        EXTINCT_PATTERN = Pattern.compile("†\\s*");

        // --- cultivars, strains, viruses, genes, Candidatus names ---
        CULTIVAR = Pattern.compile("(?: cv\\.? ?)?[\"'] ?((?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ]?[a-zïëöüäåéèčáàæœ]+[- ]?){1,3}) ?[\"']");
        CULTIVAR_GROUP = Pattern.compile("(?<!^)\\b[\"']?((?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ][a-zïëöüäåéèčáàæœ]{2,}[- ]?){1,3})[\"']? (Group|Hybrids|Sort|[Gg]rex)\\b");
        STRAIN = Pattern.compile("([a-z]\\.?) +([A-Z]+ *[0-9]+T?)$");
        IS_VIRUS_PATTERN = Pattern.compile("virus(es)?\\b|\\b(viroid|(bacterio|viro)?phage(in|s)?|(alpha|beta) ?satellites?|particles?|ictv$)\\b", Pattern.CASE_INSENSITIVE);
        IS_VIRUS_PATTERN_CASE_SENSITIVE = Pattern.compile("\\b(:?[MS]?NP|G)V\\b");
        IS_VIRUS_PATTERN_POSTFAIL = Pattern.compile("(\\b(vector)\\b)", Pattern.CASE_INSENSITIVE);
        IS_GENE = Pattern.compile("(RNA|DNA)[0-9]*(?:\\b|_)");
        IS_CANDIDATUS_PATTERN = Pattern.compile(CANDIDATUS, Pattern.CASE_INSENSITIVE);
        IS_CANDIDATUS_QUOTE_PATTERN = Pattern.compile("\"(Candidatus\\s|Ca\\.)\\s*(.+)\"", Pattern.CASE_INSENSITIVE);

        // --- trailing rank markers, sensu, nomenclatural status, remarks, years ---
        // RANK_MARKER_ALL is cut before its last ')' so that the microbial markers (minus their
        // first three characters) can be appended as a further alternative.
        RANK_MARKER_AT_END = Pattern.compile(" " + NormalisedNameParser.RANK_MARKER_ALL.substring(0, NormalisedNameParser.RANK_MARKER_ALL.lastIndexOf(')')) + "|" + NormalisedNameParser.RANK_MARKER_MICROBIAL.substring(3) + "\\.? ?(?:Ad|Lv)?\\.?$");
        EXTRACT_SENSU = Pattern.compile(",?\\s+((s\\.(?:l\\.|str\\.)|sensu\\s+(?:latu|strictu?)|(sec|sensu|auct|non)((\\.|\\s)(.*))?)$|\\((s\\.(?:l\\.|str\\.)|sensu\\s+(?:latu|strictu?)|(sec|sensu|auct|non)((\\.|\\s)(.*))?)\\))");
        NOV_RANK_MARKER = Pattern.compile("(fam|gen|sp|ssp|var|forma)");
        EXTRACT_NOMSTATUS = Pattern.compile("(?:, ?| )\\(?((?:comb|fam|gen|sp|ssp|var|forma)?[\\. ] ?nov[\\. $](?: ?ined\\.?)?|ined\\.|nom(?:\\s+|\\.\\s*|en\\s+)(?:utiq(?:ue\\s+|\\.\\s*))?(?:ambig|alter|alt|correct|cons|dubium|dub|herb|illeg|invalid|inval|negatum|neg|novum|nov|nudum|nud|oblitum|obl|praeoccup|prov|prot|transf|superfl|super|rejic|rej)\\.?(?:\\s+(?:prop|proposed)\\.?)?)\\)?");
        EXTRACT_REMARKS = Pattern.compile("\\s+(anon\\.?)(\\s.+)?$");
        EXTRACT_YEAR = Pattern.compile("([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?\\s*\\)?)");
        COMMA_BEFORE_YEAR = Pattern.compile("(,+|[^0-9\\(\\[\"])\\s*(\\d{3})");

        // --- normalisation patterns ---
        REPLACE_QUOTES = Pattern.compile("(^\\s*[\"',]+)|([\"',]+\\s*$)");
        NORM_QUOTES = Pattern.compile("([\"'`´]+)");
        NORM_UPPERCASE_WORDS = Pattern.compile("\\b(\\p{Lu})(\\p{Lu}{2,})\\b");
        NORM_WHITESPACE = Pattern.compile("\\s+");
        NORM_NO_SQUARE_BRACKETS = Pattern.compile("\\[(.*?)\\]");
        NORM_BRACKETS_OPEN = Pattern.compile("([{(\\[])\\s*,?");
        NORM_BRACKETS_CLOSE = Pattern.compile(",?\\s*([})\\]])");
        NORM_BRACKETS_OPEN_STRONG = Pattern.compile("( ?[{(\\[] ?)+");
        NORM_BRACKETS_CLOSE_STRONG = Pattern.compile("( ?[})\\]] ?)+");
        NORM_AND = Pattern.compile(" (and|et|und) ");
        NORM_ET_AL = Pattern.compile("(?:& )+al\\.?");
        NORM_AMPERSAND_WS = Pattern.compile("&");
        NORM_HYPHENS = Pattern.compile("\\s*-\\s*");
        NORM_SUBGENUS = Pattern.compile("([A-ZÏËÖÜÄÉÈČÁÀÆŒ](?:\\.|[a-zïëöüäåéèčáàæœ]+)(?:-[A-ZÏËÖÜÄÉÈČÁÀÆŒ]?[a-zïëöüäåéèčáàæœ]+)?) ([A-ZÏËÖÜÄÉÈČÁÀÆŒ](?:\\.|[a-zïëöüäåéèčáàæœ]+)(?:-[A-ZÏËÖÜÄÉÈČÁÀÆŒ]?[a-zïëöüäåéèčáàæœ]+)?) ([a-zïëöüäåéèčáàæœ+-]{5,})");
        NO_Q_MARKS = Pattern.compile("([a-zïëöüäåéèčáàæœ\\p{Ll}-])\\?+");
        NORM_COMMAS = Pattern.compile("\\s*,+");
        NORM_ORIG_AUTH = Pattern.compile("(?<=[ \\(])((?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*) ?\\( ?([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)\\)");
        NORM_ORIG_AUTH2 = Pattern.compile("\\(((?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*)\\) ?,? ?([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)");
        NORM_IMPRINT_YEAR = Pattern.compile("([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)\\s*(?:\\(\"?[\\s0-9-_,?]+\"?\\)|\\[\"?[0-9 -,]+\"?\\]|\"[0-9 -,]+\")");
        NORM_HYBRIDS_GENUS = Pattern.compile("^\\s*(?:[+×xX]|√ó)\\s*([A-ZÏËÖÜÄÉÈČÁÀÆŒ])");
        NORM_HYBRIDS_EPITH = Pattern.compile("^\\s*(×?[A-ZÏËÖÜÄÉÈČÁÀÆŒ](?:\\.|[a-zïëöüäåéèčáàæœ]+)(?:-[A-ZÏËÖÜÄÉÈČÁÀÆŒ]?[a-zïëöüäåéèčáàæœ]+)?)\\s+(?:×|√ó|[xX]\\s)\\s*((?:[0-9]+-?|[doml]')?(?:(?:van|novae) [a-z])?[a-zïëöüäåéèčáàæœ+-]{1,}(?<! d)[a-zïëöüäåéèčáàæœ](?<!(?:\\bex|bacilliform|coliform|coryneform|cytoform|chemoform|biovar|serovar|genomovar|agamovar|cultivar|genotype|serotype|subtype|ribotype|isolate))(?=\\b))");
        NORM_HYBRIDS_FORM = Pattern.compile(" ([×xX]|√ó) ");
        NORM_INDET = Pattern.compile("((^| )(undet|indet|aff|cf)[#!?\\.]?)+(?![a-z])");
        NORM_DOTS = Pattern.compile("(^\\s*[A-ZÏËÖÜÄÉÈČÁÀÆŒ]|" + NormalisedNameParser.RANK_MARKER_ALL + ")\\.");
        NORM_TF_GENUS = Pattern.compile("^([A-ZÏËÖÜÄÉÈČÁÀÆŒ])\\(([a-zïëöüäåéèčáàæœ-]+)\\)\\.? ");
        NORM_IN_COMMA = Pattern.compile(", in ", Pattern.CASE_INSENSITIVE);
        NORM_IN_BIB = Pattern.compile("( in .+$| ?: ?[0-9]+)", Pattern.CASE_INSENSITIVE);
        NORM_PREFIXES = Pattern.compile("^(sub)?(fossil|" + StringUtils.join(RankUtils.RANK_MARKER_MAP_SUPRAGENERIC.keySet(), "|") + ")\\.?\\s+", Pattern.CASE_INSENSITIVE);
        NORM_SUFFIXES = Pattern.compile("[,;:]? (sp|anon|spp|hort|ms|&|[a-zA-Z][0-9])?\\.? *$", Pattern.CASE_INSENSITIVE);
        NO_LETTERS = Pattern.compile("^[^a-zA-Z]+$");

        // --- placeholder and doubtful name detection ---
        REMOVE_PLACEHOLDER_AUTHOR = Pattern.compile("\\b(?:unknown|unspecified|uncertain|\\?)[, ] ?([12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)$", Pattern.CASE_INSENSITIVE);
        REMOVE_PLACEHOLDER_INFRAGENERIC = Pattern.compile("\\b\\( ?(?:unnamed|mixed|unassigned|unallocated|unplaced|undetermined|unclassified|uncultured|unknown|unspecified|uncertain|incertae sedis|not assigned|awaiting allocation|temp|dummy) ?\\) ", Pattern.CASE_INSENSITIVE);
        PLACEHOLDER = Pattern.compile("\\b(?:unnamed|mixed|unassigned|unallocated|unplaced|undetermined|unclassified|uncultured|unknown|unspecified|uncertain|incertae sedis|not assigned|awaiting allocation|temp|dummy)\\b", Pattern.CASE_INSENSITIVE);
        DOUBTFUL = Pattern.compile("^[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}a-zïëöüäåéèčáàæœ\\p{Ll}-×&*+ ,.()/'`´0-9-]+$");
        DOUBTFUL2 = Pattern.compile("\\bnull\\b");
        BAD_NAME_SUFFICES = Pattern.compile(" (author|unknown|unassigned|not_stated)$", Pattern.CASE_INSENSITIVE);

        // --- markup clean-up ---
        XML_ENTITY_STRIP = Pattern.compile("&\\s*([a-z]+)\\s*;");
        AMPERSAND_ENTITY = Pattern.compile("& *amp +");
        XML_TAGS = Pattern.compile("< */? *[a-zA-Z] *>");
        FIRST_WORD = Pattern.compile("^([×xX]\\s+)?([×x][A-Z])?([a-zA-Z])([a-zA-Z]+) ");
        NORM_WEIRD_CHARS = Pattern.compile(WEIRD_CHARS);
        FORM_SPECIALIS = Pattern.compile("\\bf\\.sp(?:ec)?\\b");
        SENSU_LATU = Pattern.compile("\\bs\\.l\\.\\b");

        // TYPE_TO_VAR: "\b(<stem1>|<stem2>|...)type\b", where each stem is the lower-cased name of
        // a microbial infrasubspecific rank ending in VAR, with that "var" suffix removed.
        StringBuilder sb = new StringBuilder();
        sb.append("\\b(");
        for (Rank rank : RankUtils.INFRASUBSPECIFIC_MICROBIAL_RANKS) {
            if (rank.name().endsWith("VAR")) {
                // The opening "\b(" is three characters long, so anything longer than four means
                // at least one stem was already appended and a separator is needed.
                if (sb.length() > 4) {
                    sb.append("|");
                }
                // Locale.ROOT keeps the enum name lower-casing independent of the JVM default locale.
                sb.append(rank.name().toLowerCase(Locale.ROOT).substring(0, rank.name().length() - 3));
            }
        }
        sb.append(")type\\b");
        TYPE_TO_VAR = Pattern.compile(sb.toString());

        // The literal below must stay on a single line: a line break inside a Java string literal does not compile.
        COMB_BAS_AUTHOR_SWAP = Pattern.compile("( (?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*)(?:( ?,? ?[12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?))? ?\\(( ?(?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)?(?:(?: ?ex\\.? | & | et | in |, ?|; ?|\\.)(?:(?:(?:(?:[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]{1,3}\\.?[ -]?){0,3}|[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}][a-zïëöüäåéèčáàæœ\\p{Ll}-?]{3,} )?(?:[vV](?:an)(?:[ -](?:den|der) )? ?|von[ -](?:den |der |dem )?|(?:del|de|di|da)[`' _]|(?:Des|De|Di|N)[`' _]?|(?:de )?(?:la|le) |d'|D'|Mac|Mc|Le|St\\.? ?|Ou|O')?(?:v\\. )?[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?(?:(?:[- ](?:de|da|du)?[- ]?)[A-ZÏËÖÜÄÉÈČÁÀÆŒ\\p{Lu}]+[a-zïëöüäåéèčáàæœ\\p{Ll}-?]*\\.?)?(?: ?(?:f|fil|j|jr|jun|junior|sr|sen|senior|ms)\\.?)?(?: *: *(?:Pers|Fr)\\.?)?)|al\\.?))*)( ?,? ?[12][0-9][0-9][0-9?][abcdh?]?(?:[/-][0-9]{1,4})?)?\\)");
    }
}
