001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.openstreetmap.josm.data.validation.routines; 018 019import static org.openstreetmap.josm.tools.I18n.tr; 020 021import java.net.URI; 022import java.net.URISyntaxException; 023import java.util.Collections; 024import java.util.HashSet; 025import java.util.Locale; 026import java.util.Set; 027import java.util.regex.Matcher; 028import java.util.regex.Pattern; 029 030import org.openstreetmap.josm.Main; 031 032/** 033 * <p><b>URL Validation</b> routines.</p> 034 * Behavior of validation is modified by passing in options: 035 * <ul> 036 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path 037 * component.</li> 038 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is 039 * included then fragments are flagged as illegal.</li> 040 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are 041 * considered valid schemes. Enabling this option will let any scheme pass validation.</li> 042 * </ul> 043 * 044 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, 045 * http://javascript.internet.com. However, this validation now bears little resemblance 046 * to the php original.</p> 047 * <pre> 048 * Example of usage: 049 * Construct a UrlValidator with valid schemes of "http", and "https". 050 * 051 * String[] schemes = {"http","https"}. 052 * UrlValidator urlValidator = new UrlValidator(schemes); 053 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 054 * System.out.println("url is valid"); 055 * } else { 056 * System.out.println("url is invalid"); 057 * } 058 * 059 * prints "url is invalid" 060 * If instead the default constructor is used. 061 * 062 * UrlValidator urlValidator = new UrlValidator(); 063 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 064 * System.out.println("url is valid"); 065 * } else { 066 * System.out.println("url is invalid"); 067 * } 068 * 069 * prints out "url is valid" 070 * </pre> 071 * 072 * @version $Revision: 1741724 $ 073 * @see 074 * <a href="http://www.ietf.org/rfc/rfc2396.txt"> 075 * Uniform Resource Identifiers (URI): Generic Syntax 076 * </a> 077 * 078 * @since Validator 1.4 079 */ 080public class UrlValidator extends AbstractValidator { 081 082 /** 083 * Allows all validly formatted schemes to pass validation instead of 084 * supplying a set of valid schemes. 085 */ 086 public static final long ALLOW_ALL_SCHEMES = 1 << 0; 087 088 /** 089 * Allow two slashes in the path component of the URL. 090 */ 091 public static final long ALLOW_2_SLASHES = 1 << 1; 092 093 /** 094 * Enabling this options disallows any URL fragments. 095 */ 096 public static final long NO_FRAGMENTS = 1 << 2; 097 098 /** 099 * Allow local URLs, such as http://localhost/ or http://machine/ . 100 * This enables a broad-brush check, for complex local machine name 101 * validation requirements you should create your validator with 102 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)}) 103 */ 104 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber 105 106 /** 107 * This expression derived/taken from the BNF for URI (RFC2396). 108 */ 109 private static final String URL_REGEX = 110 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"; 111 // 12 3 4 5 6 7 8 9 112 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX); 113 114 /** 115 * Schema/Protocol (ie. http:, ftp:, file:, etc). 116 */ 117 private static final int PARSE_URL_SCHEME = 2; 118 119 /** 120 * Includes hostname/ip and port number. 121 */ 122 private static final int PARSE_URL_AUTHORITY = 4; 123 124 private static final int PARSE_URL_PATH = 5; 125 126 private static final int PARSE_URL_QUERY = 7; 127 128 private static final int PARSE_URL_FRAGMENT = 9; 129 130 /** 131 * Protocol scheme (e.g. http, ftp, https). 132 */ 133 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*"; 134 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX); 135 136 // Drop numeric, and "+-." for now 137 // TODO does not allow for optional userinfo. 138 // Validation of character set is done by isValidAuthority 139 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6 140 private static final String IPV6_REGEX = "[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix 141 142 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 143 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 144 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 145 // We assume that password has the same valid chars as user info 146 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]"; 147 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching 148 private static final String USERINFO_FIELD_REGEX = 149 USERINFO_CHARS_REGEX + "+:" + // At least one character for the name 150 USERINFO_CHARS_REGEX + "*@"; // password may be absent 151 private static final String AUTHORITY_REGEX = 152 "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(:\\d*)?(.*)?"; 153 // 1 e.g. user:pass@ 2 3 4 154 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX); 155 156 private static final int PARSE_AUTHORITY_IPV6 = 1; 157 158 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present 159 160 /** 161 * Should always be empty. The code currently allows spaces. 162 */ 163 private static final int PARSE_AUTHORITY_EXTRA = 4; 164 165 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"; 166 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX); 167 168 private static final String QUERY_REGEX = "^(.*)$"; 169 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX); 170 171 /** 172 * Holds the set of current validation options. 173 */ 174 private final long options; 175 176 /** 177 * The set of schemes that are allowed to be in a URL. 178 */ 179 private final Set<String> allowedSchemes; // Must be lower-case 180 181 /** 182 * Regular expressions used to manually validate authorities if IANA 183 * domain name validation isn't desired. 184 */ 185 private final RegexValidator authorityValidator; 186 187 /** 188 * If no schemes are provided, default to this set. 189 */ 190 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case 191 192 /** 193 * Singleton instance of this class with default schemes and options. 194 */ 195 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator(); 196 197 /** 198 * Returns the singleton instance of this class with default schemes and options. 199 * @return singleton instance with default schemes and options 200 */ 201 public static UrlValidator getInstance() { 202 return DEFAULT_URL_VALIDATOR; 203 } 204 205 /** 206 * Create a UrlValidator with default properties. 207 */ 208 public UrlValidator() { 209 this((String[]) null); 210 } 211 212 /** 213 * Behavior of validation is modified by passing in several strings options: 214 * @param schemes Pass in one or more url schemes to consider valid, passing in 215 * a null will default to "http,https,ftp" being valid. 216 * If a non-null schemes is specified then all valid schemes must 217 * be specified. Setting the ALLOW_ALL_SCHEMES option will 218 * ignore the contents of schemes. 219 */ 220 public UrlValidator(String ... schemes) { 221 this(schemes, 0L); 222 } 223 224 /** 225 * Initialize a UrlValidator with the given validation options. 226 * @param options The options should be set using the public constants declared in 227 * this class. To set multiple options you simply add them together. For example, 228 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 229 */ 230 public UrlValidator(long options) { 231 this(null, null, options); 232 } 233 234 /** 235 * Behavior of validation is modified by passing in options: 236 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 237 * @param options The options should be set using the public constants declared in 238 * this class. To set multiple options you simply add them together. For example, 239 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 240 */ 241 public UrlValidator(String[] schemes, long options) { 242 this(schemes, null, options); 243 } 244 245 /** 246 * Initialize a UrlValidator with the given validation options. 247 * @param authorityValidator Regular expression validator used to validate the authority part 248 * This allows the user to override the standard set of domains. 249 * @param options Validation options. Set using the public constants of this class. 250 * To set multiple options, simply add them together: 251 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 252 * enables both of those options. 253 */ 254 public UrlValidator(RegexValidator authorityValidator, long options) { 255 this(null, authorityValidator, options); 256 } 257 258 /** 259 * Customizable constructor. Validation behavior is modifed by passing in options. 260 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 261 * @param authorityValidator Regular expression validator used to validate the authority part 262 * @param options Validation options. Set using the public constants of this class. 263 * To set multiple options, simply add them together: 264 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 265 * enables both of those options. 266 */ 267 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) { 268 this.options = options; 269 270 if (isOn(ALLOW_ALL_SCHEMES)) { 271 allowedSchemes = Collections.emptySet(); 272 } else { 273 if (schemes == null) { 274 schemes = DEFAULT_SCHEMES; 275 } 276 allowedSchemes = new HashSet<>(schemes.length); 277 for (int i = 0; i < schemes.length; i++) { 278 allowedSchemes.add(schemes[i].toLowerCase(Locale.ENGLISH)); 279 } 280 } 281 282 this.authorityValidator = authorityValidator; 283 } 284 285 /** 286 * <p>Checks if a field has a valid url address.</p> 287 * 288 * Note that the method calls #isValidAuthority() 289 * which checks that the domain is valid. 290 * 291 * @param value The value validation is being performed on. A <code>null</code> 292 * value is considered invalid. 293 * @return true if the url is valid. 294 */ 295 @Override 296 public boolean isValid(String value) { 297 if (value == null) { 298 return false; 299 } 300 301 // Check the whole url address structure 302 Matcher urlMatcher = URL_PATTERN.matcher(value); 303 if (!urlMatcher.matches()) { 304 setErrorMessage(tr("URL is invalid")); 305 return false; 306 } 307 308 String scheme = urlMatcher.group(PARSE_URL_SCHEME); 309 if (!isValidScheme(scheme)) { 310 setErrorMessage(tr("URL contains an invalid protocol: {0}", scheme)); 311 return false; 312 } 313 314 String authority = urlMatcher.group(PARSE_URL_AUTHORITY); 315 if ("file".equals(scheme)) { // Special case - file: allows an empty authority 316 if (!"".equals(authority) && authority.contains(":")) { // but cannot allow trailing : 317 setErrorMessage(tr("URL contains an invalid authority: {0}", authority)); 318 return false; 319 } 320 // drop through to continue validation 321 } else { // not file: 322 // Validate the authority 323 if (!isValidAuthority(authority)) { 324 setErrorMessage(tr("URL contains an invalid authority: {0}", authority)); 325 return false; 326 } 327 } 328 329 String path = urlMatcher.group(PARSE_URL_PATH); 330 if (!isValidPath(path)) { 331 setErrorMessage(tr("URL contains an invalid path: {0}", path)); 332 return false; 333 } 334 335 String query = urlMatcher.group(PARSE_URL_QUERY); 336 if (!isValidQuery(query)) { 337 setErrorMessage(tr("URL contains an invalid query: {0}", query)); 338 return false; 339 } 340 341 String fragment = urlMatcher.group(PARSE_URL_FRAGMENT); 342 if (!isValidFragment(fragment)) { 343 setErrorMessage(tr("URL contains an invalid fragment: {0}", fragment)); 344 return false; 345 } 346 347 return true; 348 } 349 350 @Override 351 public String getValidatorName() { 352 return tr("URL validator"); 353 } 354 355 /** 356 * Validate scheme. If schemes[] was initialized to a non null, 357 * then only those schemes are allowed. 358 * Otherwise the default schemes are "http", "https", "ftp". 359 * Matching is case-blind. 360 * @param scheme The scheme to validate. A <code>null</code> value is considered 361 * invalid. 362 * @return true if valid. 363 */ 364 protected boolean isValidScheme(String scheme) { 365 if (scheme == null) { 366 return false; 367 } 368 369 // TODO could be removed if external schemes were checked in the ctor before being stored 370 if (!SCHEME_PATTERN.matcher(scheme).matches()) { 371 return false; 372 } 373 374 if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) { 375 return false; 376 } 377 378 return true; 379 } 380 381 /** 382 * Returns true if the authority is properly formatted. An authority is the combination 383 * of hostname and port. A <code>null</code> authority value is considered invalid. 384 * Note: this implementation validates the domain unless a RegexValidator was provided. 385 * If a RegexValidator was supplied and it matches, then the authority is regarded 386 * as valid with no further checks, otherwise the method checks against the 387 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS) 388 * @param authority Authority value to validate, alllows IDN 389 * @return true if authority (hostname and port) is valid. 390 */ 391 protected boolean isValidAuthority(String authority) { 392 if (authority == null) { 393 return false; 394 } 395 396 // check manual authority validation if specified 397 if (authorityValidator != null && authorityValidator.isValid(authority)) { 398 return true; 399 } 400 // convert to ASCII if possible 401 final String authorityASCII = DomainValidator.unicodeToASCII(authority); 402 403 Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII); 404 if (!authorityMatcher.matches()) { 405 return false; 406 } 407 408 // We have to process IPV6 separately because that is parsed in a different group 409 String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6); 410 if (ipv6 != null) { 411 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 412 if (!inetAddressValidator.isValidInet6Address(ipv6)) { 413 return false; 414 } 415 } else { 416 String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); 417 // check if authority is hostname or IP address: 418 // try a hostname first since that's much more likely 419 DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS)); 420 if (!domainValidator.isValid(hostLocation)) { 421 // try an IPv4 address 422 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 423 if (!inetAddressValidator.isValidInet4Address(hostLocation)) { 424 // isn't IPv4, so the URL is invalid 425 return false; 426 } 427 } 428 } 429 430 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); 431 if (extra != null && !extra.trim().isEmpty()) { 432 return false; 433 } 434 435 return true; 436 } 437 438 /** 439 * Returns true if the path is valid. A <code>null</code> value is considered invalid. 440 * @param path Path value to validate. 441 * @return true if path is valid. 442 */ 443 protected boolean isValidPath(String path) { 444 if (path == null) { 445 return false; 446 } 447 448 if (!PATH_PATTERN.matcher(path).matches()) { 449 return false; 450 } 451 452 try { 453 URI uri = new URI(null, null, path, null); 454 String norm = uri.normalize().getPath(); 455 if (norm.startsWith("/../") // Trying to go via the parent dir 456 || "/..".equals(norm)) { // Trying to go to the parent dir 457 return false; 458 } 459 } catch (URISyntaxException e) { 460 Main.trace(e); 461 return false; 462 } 463 464 int slash2Count = countToken("//", path); 465 if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) { 466 return false; 467 } 468 469 return true; 470 } 471 472 /** 473 * Returns true if the query is null or it's a properly formatted query string. 474 * @param query Query value to validate. 475 * @return true if query is valid. 476 */ 477 protected boolean isValidQuery(String query) { 478 if (query == null) { 479 return true; 480 } 481 482 return QUERY_PATTERN.matcher(query).matches(); 483 } 484 485 /** 486 * Returns true if the given fragment is null or fragments are allowed. 487 * @param fragment Fragment value to validate. 488 * @return true if fragment is valid. 489 */ 490 protected boolean isValidFragment(String fragment) { 491 if (fragment == null) { 492 return true; 493 } 494 495 return isOff(NO_FRAGMENTS); 496 } 497 498 /** 499 * Returns the number of times the token appears in the target. 500 * @param token Token value to be counted. 501 * @param target Target value to count tokens in. 502 * @return the number of tokens. 503 */ 504 protected int countToken(String token, String target) { 505 int tokenIndex = 0; 506 int count = 0; 507 while (tokenIndex != -1) { 508 tokenIndex = target.indexOf(token, tokenIndex); 509 if (tokenIndex > -1) { 510 tokenIndex++; 511 count++; 512 } 513 } 514 return count; 515 } 516 517 /** 518 * Tests whether the given flag is on. If the flag is not a power of 2 519 * (ie. 3) this tests whether the combination of flags is on. 520 * 521 * @param flag Flag value to check. 522 * 523 * @return whether the specified flag value is on. 524 */ 525 private boolean isOn(long flag) { 526 return (options & flag) > 0; 527 } 528 529 /** 530 * Tests whether the given flag is off. If the flag is not a power of 2 531 * (ie. 3) this tests whether the combination of flags is off. 532 * 533 * @param flag Flag value to check. 534 * 535 * @return whether the specified flag value is off. 536 */ 537 private boolean isOff(long flag) { 538 return (options & flag) == 0; 539 } 540}