Fix filename and URI handling

Many strings are passed to the library that could be either URIs or filesystem paths. We now assume that strings are a URI if they contain the substring "://". This means that they have a scheme and an authority. Otherwise, URI resolution wouldn't make much sense. Fix xmlBuildURI to work with filesystem paths. If the base URI doesn't contain "://" it is treated as filename. The resolved URI is unescaped, appended and the result is normalized. Rewrite xmlNormalizePath to handle Windows quirks. All special handling for Windows paths is removed in xmlCanonicPath. If the path looks like an URI, only escape characters allowed in Legacy Extended IRIs. Make xmlPathToURI only call xmlCanonicPath. Theh additional round-trip through URI parser and serializer seems useless. Add a helper function xmlConvertUriToPath in xmlIO.c which checks for file URIs and unescapes them. Always process strings with xmlCanonicPath in xmlLoadExternalEntity. This should be harmless now. Should help with #334, #387, #611.
2025-03-20 18:50:08 +03:00 · 2023-12-23 00:00:15 +01:00 · 2023-12-23 00:00:15 +01:00 · 8ab1b122c4
commit 8ab1b122c4
parent 28913232f6
3 changed files with 401 additions and 544 deletions
--- a/runtest.c
+++ b/runtest.c
@ -1070,6 +1070,16 @@ entityDeclDebug(void *ctx, const xmlChar *name, int type,
    xmlEntityPtr ent;
    const xmlChar *nullstr = BAD_CAST "(null)";

+    ent = xmlNewEntity(NULL, name, type, publicId, systemId, content);
+    if (systemId != NULL)
+        ent->URI = xmlBuildURI(systemId, (const xmlChar *) ctxt->filename);
+
+    if ((type == XML_INTERNAL_PARAMETER_ENTITY) ||
+        (type == XML_EXTERNAL_PARAMETER_ENTITY))
+        xmlHashAddEntry(ctxt->parameterEntities, name, ent);
+    else
+        xmlHashAddEntry(ctxt->generalEntities, name, ent);
+
    /* not all libraries handle printing null pointers nicely */
    if (publicId == NULL)
        publicId = nullstr;
@ -1082,16 +1092,6 @@ entityDeclDebug(void *ctx, const xmlChar *name, int type,
 	return;
    fprintf(SAXdebug, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
            name, type, publicId, systemId, content);
-
-    ent = xmlNewEntity(NULL, name, type, publicId, systemId, content);
-    if (systemId != NULL)
-        ent->URI = xmlBuildURI(systemId, (const xmlChar *) ctxt->filename);
-
-    if ((type == XML_INTERNAL_PARAMETER_ENTITY) ||
-        (type == XML_EXTERNAL_PARAMETER_ENTITY))
-        xmlHashAddEntry(ctxt->parameterEntities, name, ent);
-    else
-        xmlHashAddEntry(ctxt->generalEntities, name, ent);
 }

 /**
--- a/uri.c
+++ b/uri.c
@ -1460,6 +1460,120 @@ xmlFreeURI(xmlURIPtr uri) {
 *									*
 ************************************************************************/

+static int
+xmlIsPathSeparator(int c, int isFile) {
+    (void) isFile;
+
+    if (c == '/')
+        return(1);
+
+#ifdef _WIN32
+    if (isFile && (c == '\\'))
+        return(1);
+#endif
+
+    return(0);
+}
+
+/**
+ * xmlNormalizePath:
+ * @path:  pointer to the path string
+ * @isFile:  true for filesystem paths, false for URIs
+ *
+ * Normalize a filesystem path or URI.
+ *
+ * Returns 0 or an error code
+ */
+static int
+xmlNormalizePath(char *path, int isFile) {
+    char *cur, *out;
+    int numSeg = 0;
+
+    if (path == NULL)
+	return(-1);
+
+    cur = path;
+    out = path;
+
+    if (*cur == 0)
+        return(0);
+
+    if (xmlIsPathSeparator(*cur, isFile)) {
+        cur++;
+        *out++ = '/';
+    }
+
+    while (*cur != 0) {
+        /*
+         * At this point, out is either empty or ends with a separator.
+         * Collapse multiple separators first.
+         */
+        while (xmlIsPathSeparator(*cur, isFile)) {
+#ifdef _WIN32
+            /* Allow two separators at start of path */
+            if ((isFile) && (out == path + 1))
+                *out++ = '/';
+#endif
+            cur++;
+        }
+
+        if (*cur == '.') {
+            if (cur[1] == 0) {
+                /* Ignore "." at end of path */
+                break;
+            } else if (xmlIsPathSeparator(cur[1], isFile)) {
+                /* Skip "./" */
+                cur += 2;
+                continue;
+            } else if ((cur[1] == '.') &&
+                       ((cur[2] == 0) || xmlIsPathSeparator(cur[2], isFile))) {
+                if (numSeg > 0) {
+                    /* Handle ".." by removing last segment */
+                    do {
+                        out--;
+                    } while ((out > path) &&
+                             !xmlIsPathSeparator(out[-1], isFile));
+                    numSeg--;
+
+                    if (cur[2] == 0)
+                        break;
+                    cur += 3;
+                    continue;
+                } else if (out[0] == '/') {
+                    /* Ignore extraneous ".." in absolute paths */
+                    if (cur[2] == 0)
+                        break;
+                    cur += 3;
+                    continue;
+                } else {
+                    /* Keep "../" at start of relative path */
+                    numSeg--;
+                }
+            }
+        }
+
+        /* Copy segment */
+        while ((*cur != 0) && !xmlIsPathSeparator(*cur, isFile)) {
+            *out++ = *cur++;
+        }
+
+        /* Copy separator */
+        if (*cur != 0) {
+            cur++;
+            *out++ = '/';
+        }
+
+        numSeg++;
+    }
+
+    /* Keep "." if output is empty and it's a file */
+    if ((isFile) && (out <= path))
+        *out++ = '.';
+    *out = 0;
+
+    return(0);
+}
+
 /**
 * xmlNormalizeURIPath:
 * @path:  pointer to the path string
@ -1473,180 +1587,7 @@ xmlFreeURI(xmlURIPtr uri) {
 */
 int
 xmlNormalizeURIPath(char *path) {
-    char *cur, *out;
-
-    if (path == NULL)
-	return(-1);
-
-    /* Skip all initial "/" chars.  We want to get to the beginning of the
-     * first non-empty segment.
-     */
-    cur = path;
-    while (cur[0] == '/')
-      ++cur;
-    if (cur[0] == '\0')
-      return(0);
-
-    /* Keep everything we've seen so far.  */
-    out = cur;
-
-    /*
-     * Analyze each segment in sequence for cases (c) and (d).
-     */
-    while (cur[0] != '\0') {
-	/*
-	 * c) All occurrences of "./", where "." is a complete path segment,
-	 *    are removed from the buffer string.
-	 */
-	if ((cur[0] == '.') && (cur[1] == '/')) {
-	    cur += 2;
-	    /* '//' normalization should be done at this point too */
-	    while (cur[0] == '/')
-		cur++;
-	    continue;
-	}
-
-	/*
-	 * d) If the buffer string ends with "." as a complete path segment,
-	 *    that "." is removed.
-	 */
-	if ((cur[0] == '.') && (cur[1] == '\0'))
-	    break;
-
-	/* Otherwise keep the segment.  */
-	while (cur[0] != '/') {
-            if (cur[0] == '\0')
-              goto done_cd;
-	    (out++)[0] = (cur++)[0];
-	}
-	/* normalize // */
-	while ((cur[0] == '/') && (cur[1] == '/'))
-	    cur++;
-
-        (out++)[0] = (cur++)[0];
-    }
- done_cd:
-    out[0] = '\0';
-
-    /* Reset to the beginning of the first segment for the next sequence.  */
-    cur = path;
-    while (cur[0] == '/')
-      ++cur;
-    if (cur[0] == '\0')
-	return(0);
-
-    /*
-     * Analyze each segment in sequence for cases (e) and (f).
-     *
-     * e) All occurrences of "<segment>/../", where <segment> is a
-     *    complete path segment not equal to "..", are removed from the
-     *    buffer string.  Removal of these path segments is performed
-     *    iteratively, removing the leftmost matching pattern on each
-     *    iteration, until no matching pattern remains.
-     *
-     * f) If the buffer string ends with "<segment>/..", where <segment>
-     *    is a complete path segment not equal to "..", that
-     *    "<segment>/.." is removed.
-     *
-     * To satisfy the "iterative" clause in (e), we need to collapse the
-     * string every time we find something that needs to be removed.  Thus,
-     * we don't need to keep two pointers into the string: we only need a
-     * "current position" pointer.
-     */
-    while (1) {
-        char *segp, *tmp;
-
-        /* At the beginning of each iteration of this loop, "cur" points to
-         * the first character of the segment we want to examine.
-         */
-
-        /* Find the end of the current segment.  */
-        segp = cur;
-        while ((segp[0] != '/') && (segp[0] != '\0'))
-          ++segp;
-
-        /* If this is the last segment, we're done (we need at least two
-         * segments to meet the criteria for the (e) and (f) cases).
-         */
-        if (segp[0] == '\0')
-          break;
-
-        /* If the first segment is "..", or if the next segment _isn't_ "..",
-         * keep this segment and try the next one.
-         */
-        ++segp;
-        if (((cur[0] == '.') && (cur[1] == '.') && (segp == cur+3))
-            || ((segp[0] != '.') || (segp[1] != '.')
-                || ((segp[2] != '/') && (segp[2] != '\0')))) {
-          cur = segp;
-          continue;
-        }
-
-        /* If we get here, remove this segment and the next one and back up
-         * to the previous segment (if there is one), to implement the
-         * "iteratively" clause.  It's pretty much impossible to back up
-         * while maintaining two pointers into the buffer, so just compact
-         * the whole buffer now.
-         */
-
-        /* If this is the end of the buffer, we're done.  */
-        if (segp[2] == '\0') {
-          cur[0] = '\0';
-          break;
-        }
-        /* Valgrind complained, strcpy(cur, segp + 3); */
-        /* string will overlap, do not use strcpy */
-        tmp = cur;
-        segp += 3;
-        while ((*tmp++ = *segp++) != 0)
-          ;
-
-        /* If there are no previous segments, then keep going from here.  */
-        segp = cur;
-        while ((segp > path) && ((--segp)[0] == '/'))
-          ;
-        if (segp == path)
-          continue;
-
-        /* "segp" is pointing to the end of a previous segment; find it's
-         * start.  We need to back up to the previous segment and start
-         * over with that to handle things like "foo/bar/../..".  If we
-         * don't do this, then on the first pass we'll remove the "bar/..",
-         * but be pointing at the second ".." so we won't realize we can also
-         * remove the "foo/..".
-         */
-        cur = segp;
-        while ((cur > path) && (cur[-1] != '/'))
-          --cur;
-    }
-    out[0] = '\0';
-
-    /*
-     * g) If the resulting buffer string still begins with one or more
-     *    complete path segments of "..", then the reference is
-     *    considered to be in error. Implementations may handle this
-     *    error by retaining these components in the resolved path (i.e.,
-     *    treating them as part of the final URI), by removing them from
-     *    the resolved path (i.e., discarding relative levels above the
-     *    root), or by avoiding traversal of the reference.
-     *
-     * We discard them from the final path.
-     */
-    if (path[0] == '/') {
-      cur = path;
-      while ((cur[0] == '/') && (cur[1] == '.') && (cur[2] == '.')
-             && ((cur[3] == '/') || (cur[3] == '\0')))
-	cur += 3;
-
-      if (cur != path) {
-	out = path;
-	while (cur[0] != '\0')
-          (out++)[0] = (cur++)[0];
-	out[0] = 0;
-      }
-    }
-
-    return(0);
+    return(xmlNormalizePath(path, 0));
 }

 static int is_hex(char c) {
@ -1724,8 +1665,9 @@ xmlURIUnescapeString(const char *str, int len, char *target) {
 * @str:  string to escape
 * @list: exception list string of chars not to escape
 *
- * This routine escapes a string to hex, ignoring reserved characters
- * (a-z, A-Z, 0-9, "@-_.!~*'()") and the characters in the exception list.
+ * This routine escapes a string to hex, ignoring unreserved characters
+ * a-z, A-Z, 0-9, "-._~", a few sub-delims "!*'()", the gen-delim "@"
+ * (why?) and the characters in the exception list.
 *
 * Returns a new escaped string or NULL in case of error.
 */
@ -1925,8 +1867,111 @@ xmlURIEscape(const xmlChar * str)
 *									*
 ************************************************************************/

+static int
+xmlIsAbsolutePath(const xmlChar *path) {
+    int c = path[0];
+
+    if (xmlIsPathSeparator(c, 1))
+        return(1);
+
+#ifdef _WIN32
+    if ((((c >= 'A') && (c <= 'Z')) ||
+         ((c >= 'a') && (c <= 'z'))) &&
+        (path[1] == ':'))
+        return(1);
+#endif
+
+    return(0);
+}
+
 /**
- * xmlBuildURISafe:
+ * xmlResolvePath:
+ * @ref:  the filesystem path
+ * @base:  the base value
+ * @out:  pointer to result URI
+ *
+ * Resolves a filesystem path from a base path.
+ *
+ * Returns 0 on success, -1 if a memory allocation failed or an error
+ * code if URI or base are invalid.
+ */
+static int
+xmlResolvePath(const xmlChar *escRef, const xmlChar *base, xmlChar **out) {
+    char *result = NULL;
+    xmlChar *ref = NULL;
+    int ret = -1;
+    int i;
+
+    if (out == NULL)
+        return(1);
+    *out = NULL;
+
+    if ((escRef == NULL) || (escRef[0] == 0)) {
+        if ((base == NULL) || (base[0] == 0))
+            return(1);
+        ref = xmlStrdup(base);
+        if (ref == NULL)
+            goto err_memory;
+        *out = ref;
+        return(0);
+    }
+
+    /*
+     * If a URI is resolved, we can assume it is a valid URI and not
+     * a filesystem path. This means we have to unescape it.
+     */
+    ref = (xmlChar *) xmlURIUnescapeString((char *) escRef, -1, NULL);
+    if (ref == NULL)
+        goto err_memory;
+
+    if ((base == NULL) || (base[0] == 0))
+        goto done;
+
+    if (xmlIsAbsolutePath(ref))
+        goto done;
+
+    /*
+     * Remove last segment from base
+     */
+    i = xmlStrlen(base);
+    while ((i > 0) && !xmlIsPathSeparator(base[i-1], 1))
+        i--;
+
+    /*
+     * Concatenate base and ref
+     */
+    if (i > 0) {
+        int refLen = xmlStrlen(ref);
+
+        result = xmlMalloc(i + refLen + 1);
+        if (result == NULL)
+            goto err_memory;
+
+        memcpy(result, base, i);
+        memcpy(result + i, ref, refLen + 1);
+    }
+
+    /*
+     * Normalize
+     */
+    xmlNormalizePath(result, 1);
+
+done:
+    if (result == NULL) {
+        result = (char *) ref;
+        ref = NULL;
+    }
+
+    *out = (xmlChar *) result;
+    ret = 0;
+
+err_memory:
+    xmlFree(ref);
+    return(ret);
+}
+
+/**
+ * xmlBulidURISafe:
 * @URI:  the URI instance found in the document
 * @base:  the base value
 * @valPtr:  pointer to result URI
@ -1974,6 +2019,16 @@ xmlBuildURISafe(const xmlChar *URI, const xmlChar *base, xmlChar **valPtr) {
            ret = -1;
 	goto done;
    }
+
+    /*
+     * If base has no scheme or authority, it is assumed to be a
+     * filesystem path.
+     */
+    if (xmlStrstr(base, BAD_CAST "://") == NULL) {
+        xmlFreeURI(ref);
+        return(xmlResolvePath(URI, base, valPtr));
+    }
+
    ret = xmlParseURISafe((const char *) base, &bas);
    if (ret < 0)
        goto done;
@ -2565,157 +2620,41 @@ xmlBuildRelativeURI(const xmlChar * URI, const xmlChar * base)
 * xmlCanonicPath:
 * @path:  the resource locator in a filesystem notation
 *
- * Constructs a canonic path from the specified path.
+ * Prepares a path.
 *
- * Returns a new canonic path, or a duplicate of the path parameter if the
- * construction fails. The caller is responsible for freeing the memory occupied
+ * If the path contains the substring "://", it is considered a
+ * Legacy Extended IRI. Characters which aren't allowed in URIs are
+ * escaped.
+ *
+ * Otherwise, the path is considered a filesystem path which is
+ * copied without modification.
+ *
+ * The caller is responsible for freeing the memory occupied
 * by the returned string. If there is insufficient memory available, or the
 * argument is NULL, the function returns NULL.
 */
-#define IS_WINDOWS_PATH(p)					\
-	((p != NULL) &&						\
-	 (((p[0] >= 'a') && (p[0] <= 'z')) ||			\
-	  ((p[0] >= 'A') && (p[0] <= 'Z'))) &&			\
-	 (p[1] == ':') && ((p[2] == '/') || (p[2] == '\\')))
 xmlChar *
 xmlCanonicPath(const xmlChar *path)
 {
-/*
- * For Windows implementations, additional work needs to be done to
- * replace backslashes in pathnames with "forward slashes"
- */
-#if defined(_WIN32)
-    int len = 0;
-    char *p = NULL;
-#endif
-    xmlURIPtr uri;
    xmlChar *ret;
-    const xmlChar *absuri;
-    int res;

    if (path == NULL)
 	return(NULL);

-#if defined(_WIN32)
-    /*
-     * We must not change the backslashes to slashes if the the path
-     * starts with \\?\
-     * Those paths can be up to 32k characters long.
-     * Was added specifically for OpenOffice, those paths can't be converted
-     * to URIs anyway.
-     */
-    if ((path[0] == '\\') && (path[1] == '\\') && (path[2] == '?') &&
-        (path[3] == '\\') )
-	return xmlStrdup((const xmlChar *) path);
-#endif
-
-	/* sanitize filename starting with // so it can be used as URI */
-    if ((path[0] == '/') && (path[1] == '/') && (path[2] != '/'))
-        path++;
-
-    res = xmlParseURISafe((const char *) path, &uri);
-    if (res < 0)
-        return(NULL);
-    if (uri != NULL) {
-        xmlFreeURI(uri);
-	return xmlStrdup(path);
-    }
-
    /* Check if this is an "absolute uri" */
-    absuri = xmlStrstr(path, BAD_CAST "://");
-    if (absuri != NULL) {
-        int l, j;
-	unsigned char c;
-	xmlChar *escURI;
-
-        /*
-	 * this looks like an URI where some parts have not been
-	 * escaped leading to a parsing problem.  Check that the first
-	 * part matches a protocol.
-	 */
-	l = absuri - path;
-	/* Bypass if first part (part before the '://') is > 20 chars */
-	if ((l <= 0) || (l > 20))
-	    goto path_processing;
-	/* Bypass if any non-alpha characters are present in first part */
-	for (j = 0;j < l;j++) {
-	    c = path[j];
-	    if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'))))
-	        goto path_processing;
-	}
-
-	/* Escape all except the characters specified in the supplied path */
-        escURI = xmlURIEscapeStr(path, BAD_CAST ":/?_.#&;=");
-	if (escURI == NULL)
-            return(NULL);
-        /* Try parsing the escaped path */
-        res = xmlParseURISafe((const char *) escURI, &uri);
-        if (res < 0) {
-            xmlFree(escURI);
-            return(NULL);
-        }
-        /* If successful, return the escaped string */
-        if (uri != NULL) {
-            xmlFreeURI(uri);
-            return escURI;
-        }
-        xmlFree(escURI);
-    }
-
-path_processing:
-/* For Windows implementations, replace backslashes with 'forward slashes' */
-#if defined(_WIN32)
-    /*
-     * Create a URI structure
-     */
-    uri = xmlCreateURI();
-    if (uri == NULL) {		/* Guard against 'out of memory' */
-        return(NULL);
-    }
-
-    len = xmlStrlen(path);
-    if ((len > 2) && IS_WINDOWS_PATH(path)) {
-        /* make the scheme 'file' */
-	uri->scheme = (char *) xmlStrdup(BAD_CAST "file");
-	if (uri->scheme == NULL) {
-	    xmlFreeURI(uri);
-	    return(NULL);
-	}
-	/* allocate space for leading '/' + path + string terminator */
-	uri->path = xmlMallocAtomic(len + 2);
-	if (uri->path == NULL) {
-	    xmlFreeURI(uri);	/* Guard against 'out of memory' */
-	    return(NULL);
-	}
-	/* Put in leading '/' plus path */
-	uri->path[0] = '/';
-	p = uri->path + 1;
-	strncpy(p, (char *) path, len + 1);
+    if (xmlStrstr(path, BAD_CAST "://") != NULL) {
+	/*
+         * Escape all characters except reserved, unreserved and the
+         * percent sign.
+         *
+         * xmlURIEscapeStr already keeps unreserved characters, so we
+         * pass gen-delims, sub-delims and "%" to ignore.
+         */
+        ret = xmlURIEscapeStr(path, BAD_CAST ":/?#[]@!$&()*+,;='%");
    } else {
-	uri->path = (char *) xmlStrdup(path);
-	if (uri->path == NULL) {
-	    xmlFreeURI(uri);
-	    return(NULL);
-	}
-	p = uri->path;
-    }
-    /* Now change all occurrences of '\' to '/' */
-    while (*p != '\0') {
-	if (*p == '\\')
-	    *p = '/';
-	p++;
+        ret = xmlStrdup((const xmlChar *) path);
    }

-    if (uri->scheme == NULL) {
-	ret = xmlStrdup((const xmlChar *) uri->path);
-    } else {
-	ret = xmlSaveUri(uri);
-    }
-
-    xmlFreeURI(uri);
-#else
-    ret = xmlStrdup((const xmlChar *) path);
-#endif
    return(ret);
 }

@ -2733,46 +2672,5 @@ path_processing:
 xmlChar *
 xmlPathToURI(const xmlChar *path)
 {
-    xmlURIPtr uri;
-    xmlURI temp;
-    xmlChar *ret, *cal;
-    int res;
-
-    if (path == NULL)
-        return(NULL);
-
-    res = xmlParseURISafe((const char *) path, &uri);
-    if (res < 0)
-        return(NULL);
-    if (uri != NULL) {
-        xmlFreeURI(uri);
-	return xmlStrdup(path);
-    }
-
-    cal = xmlCanonicPath(path);
-    if (cal == NULL)
-        return(NULL);
-#if defined(_WIN32)
-    /* xmlCanonicPath can return an URI on Windows (is that the intended behaviour?)
-       If 'cal' is a valid URI already then we are done here, as continuing would make
-       it invalid. */
-    if ((uri = xmlParseURI((const char *) cal)) != NULL) {
-	xmlFreeURI(uri);
-	return cal;
-    }
-    /* 'cal' can contain a relative path with backslashes. If that is processed
-       by xmlSaveURI, they will be escaped and the external entity loader machinery
-       will fail. So convert them to slashes. Misuse 'ret' for walking. */
-    ret = cal;
-    while (*ret != '\0') {
-	if (*ret == '\\')
-	    *ret = '/';
-	ret++;
-    }
-#endif
-    memset(&temp, 0, sizeof(temp));
-    temp.path = (char *) cal;
-    ret = xmlSaveUri(&temp);
-    xmlFree(cal);
-    return(ret);
+    return(xmlCanonicPath(path));
 }
--- a/xmlIO.c
+++ b/xmlIO.c
@ -362,9 +362,6 @@ xmlCtxtErrIO(xmlParserCtxtPtr ctxt, int code, const char *uri)
        str2 = errstr;
    }

-    /*
-     * TODO: Set filename in error
-     */
    xmlCtxtErr(ctxt, NULL, XML_FROM_IO, code, level,
               (const xmlChar *) uri, NULL, NULL, 0,
               msg, str1, str2);
@ -388,6 +385,7 @@ static wchar_t *
 __xmlIOWin32UTF8ToWChar(const char *u8String)
 {
    wchar_t *wString = NULL;
+    int i;

    if (u8String) {
        int wLen =
@ -402,6 +400,14 @@ __xmlIOWin32UTF8ToWChar(const char *u8String)
                    wString = NULL;
                }
            }
+
+            /*
+             * Convert to backward slash
+             */
+            for (i = 0; wString[i] != 0; i++) {
+                if (wString[i] == '/')
+                    wString[i] = '\\';
+            }
        }
    }

@ -553,6 +559,37 @@ xmlCheckFilename (const char *path)
    return 1;
 }

+static int
+xmlConvertUriToPath(const char *uri, char **out) {
+    const char *escaped;
+    char *unescaped;
+
+    *out = NULL;
+
+    if (!xmlStrncasecmp(BAD_CAST uri, BAD_CAST "file://localhost/", 17)) {
+	escaped = &uri[16];
+    } else if (!xmlStrncasecmp(BAD_CAST uri, BAD_CAST "file:///", 8)) {
+	escaped = &uri[7];
+    } else if (!xmlStrncasecmp(BAD_CAST uri, BAD_CAST "file:/", 6)) {
+        /* lots of generators seems to lazy to read RFC 1738 */
+	escaped = &uri[5];
+    } else {
+        return(1);
+    }
+
+#ifdef _WIN32
+    /* Ignore slash like in file:///C:/file.txt */
+    escaped += 1;
+#endif
+
+    unescaped = xmlURIUnescapeString(escaped, 0, NULL);
+    if (unescaped == NULL)
+        return(-1);
+
+    *out = unescaped;
+    return(0);
+}
+
 /**
 * xmlFdOpen:
 * @filename:  the URI for matching
@ -562,8 +599,7 @@ xmlCheckFilename (const char *path)
 */
 static int
 xmlFdOpen(const char *filename, void **out) {
-    const char *escaped = NULL;
-    char *unescaped = NULL;
+    char *fromUri = NULL;
    int fd;
    int ret;

@ -571,45 +607,21 @@ xmlFdOpen(const char *filename, void **out) {
    if (filename == NULL)
        return(XML_ERR_ARGUMENT);

-    /*
-     * TODO: This should be moved to uri.c. We also need support for
-     * UNC paths on Windows.
-     */
-    if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file://localhost/", 17)) {
-#if defined (_WIN32)
-	escaped = &filename[17];
-#else
-	escaped = &filename[16];
-#endif
-    } else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:///", 8)) {
-#if defined (_WIN32)
-	escaped = &filename[8];
-#else
-	escaped = &filename[7];
-#endif
-    } else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:/", 6)) {
-        /* lots of generators seems to lazy to read RFC 1738 */
-#if defined (_WIN32)
-	escaped = &filename[6];
-#else
-	escaped = &filename[5];
-#endif
-    }
+    if (xmlConvertUriToPath(filename, &fromUri) < 0)
+        return(XML_ERR_NO_MEMORY);

-    if (escaped != NULL) {
-        unescaped = xmlURIUnescapeString(escaped, 0, NULL);
-        if (unescaped == NULL)
-            return(XML_ERR_NO_MEMORY);
-        filename = unescaped;
-    }
+    if (fromUri != NULL)
+        filename = fromUri;

 #if defined(_WIN32)
    {
        wchar_t *wpath;

        wpath = __xmlIOWin32UTF8ToWChar(filename);
-        if (wpath == NULL)
+        if (wpath == NULL) {
+            xmlFree(fromUri);
            return(XML_ERR_NO_MEMORY);
+        }
 	fd = _wopen(wpath, _O_RDONLY | _O_BINARY);
        xmlFree(wpath);
    }
@ -636,9 +648,7 @@ xmlFdOpen(const char *filename, void **out) {
        ret = XML_ERR_OK;
    }

-    if (unescaped != NULL)
-        xmlFree(unescaped);
-
+    xmlFree(fromUri);
    return(ret);
 }

@ -838,7 +848,7 @@ xmlFileOpen(const char *filename) {
 */
 static void *
 xmlFileOpenW (const char *filename) {
-    const char *path = NULL;
+    char *fromUri = NULL;
    FILE *fd;

    if (!strcmp(filename, "-")) {
@ -846,33 +856,24 @@ xmlFileOpenW (const char *filename) {
 	return((void *) fd);
    }

-    if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file://localhost/", 17))
-#if defined (_WIN32)
-	path = &filename[17];
-#else
-	path = &filename[16];
-#endif
-    else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:///", 8)) {
-#if defined (_WIN32)
-	path = &filename[8];
-#else
-	path = &filename[7];
-#endif
-    } else
-	path = filename;
+    if (xmlConvertUriToPath(filename, &fromUri) < 0)
+        return(NULL);

-    if (path == NULL)
-	return(NULL);
+    if (fromUri != NULL)
+        filename = fromUri;

 #if defined(_WIN32)
-    fd = xmlWrapOpenUtf8(path, 1);
+    fd = xmlWrapOpenUtf8(filename, 1);
 #elif(__MVS__)
-    fd = fopen(path, "w");
+    fd = fopen(filename, "w");
 #else
-    fd = fopen(path, "wb");
+    fd = fopen(filename, "wb");
 #endif /* WIN32 */

-    if (fd == NULL) xmlIOErr(0, path);
+    if (fd == NULL)
+        xmlIOErr(0, filename);
+
+    xmlFree(fromUri);
    return((void *) fd);
 }
 #endif /* LIBXML_OUTPUT_ENABLED */
@ -1025,34 +1026,25 @@ xmlGzfileMatch (const char *filename ATTRIBUTE_UNUSED) {
 */
 static void *
 xmlGzfileOpen_real (const char *filename) {
-    const char *path = NULL;
+    char *fromUri = NULL;
    gzFile fd;

-    if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file://localhost/", 17))
-#if defined (_WIN32)
-	path = &filename[17];
-#else
-	path = &filename[16];
-#endif
-    else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:///", 8)) {
-#if defined (_WIN32)
-	path = &filename[8];
-#else
-	path = &filename[7];
-#endif
-    } else
-	path = filename;
+    if (xmlConvertUriToPath(filename, &fromUri) < 0)
+        return(NULL);

-    if (path == NULL)
-	return(NULL);
-    if (!xmlCheckFilename(path))
+    if (fromUri != NULL)
+        filename = fromUri;
+
+    if (!xmlCheckFilename(filename))
        return(NULL);

 #if defined(_WIN32)
-    fd = xmlWrapGzOpenUtf8(path, "rb");
+    fd = xmlWrapGzOpenUtf8(filename, "rb");
 #else
-    fd = gzopen(path, "rb");
+    fd = gzopen(filename, "rb");
 #endif
+
+    xmlFree(fromUri);
    return((void *) fd);
 }

@ -1092,7 +1084,7 @@ xmlGzfileOpen (const char *filename) {
 */
 static void *
 xmlGzfileOpenW (const char *filename, int compression) {
-    const char *path = NULL;
+    char *fromUri = NULL;
    char mode[15];
    gzFile fd;

@ -1107,29 +1099,19 @@ xmlGzfileOpenW (const char *filename, int compression) {
 	return((void *) fd);
    }

-    if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file://localhost/", 17))
-#if defined (_WIN32)
-	path = &filename[17];
-#else
-	path = &filename[16];
-#endif
-    else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:///", 8)) {
-#if defined (_WIN32)
-	path = &filename[8];
-#else
-	path = &filename[7];
-#endif
-    } else
-	path = filename;
+    if (xmlConvertUriToPath(filename, &fromUri) < 0)
+        return(NULL);

-    if (path == NULL)
-	return(NULL);
+    if (fromUri != NULL)
+        filename = fromUri;

 #if defined(_WIN32)
-    fd = xmlWrapGzOpenUtf8(path, mode);
+    fd = xmlWrapGzOpenUtf8(filename, mode);
 #else
-    fd = gzopen(path, mode);
+    fd = gzopen(filename, mode);
 #endif
+
+    xmlFree(fromUri);
    return((void *) fd);
 }
 #endif /* LIBXML_OUTPUT_ENABLED */
@ -1221,25 +1203,23 @@ xmlXzfileMatch (const char *filename ATTRIBUTE_UNUSED) {
 */
 static void *
 xmlXzfileOpen_real (const char *filename) {
-    const char *path = NULL;
+    char *fromUri = NULL;
    xzFile fd;

-    if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file://localhost/", 17)) {
-	path = &filename[16];
-    } else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:///", 8)) {
-	path = &filename[7];
-    } else if (!xmlStrncasecmp(BAD_CAST filename, BAD_CAST "file:/", 6)) {
-        /* lots of generators seems to lazy to read RFC 1738 */
-	path = &filename[5];
-    } else
-	path = filename;
-
-    if (path == NULL)
-	return(NULL);
-    if (!xmlCheckFilename(path))
+    if (xmlConvertUriToPath(filename, &fromUri) < 0)
        return(NULL);

-    fd = __libxml2_xzopen(path, "rb");
+    if (fromUri != NULL)
+        filename = fromUri;
+
+    if (!xmlCheckFilename(filename)) {
+        xmlFree(fromUri);
+        return(NULL);
+    }
+
+    fd = __libxml2_xzopen(filename, "rb");
+
+    xmlFree(fromUri);
    return((void *) fd);
 }

@ -1955,36 +1935,37 @@ __xmlOutputBufferCreateFilename(const char *URI,
    char *unescaped = NULL;
    int is_file_uri = 1;

-    (void) compression;
-
-    if (URI == NULL) return(NULL);
+    if (URI == NULL)
+        return(NULL);

    puri = xmlParseURI(URI);
    if (puri != NULL) {
        if ((puri->scheme != NULL) &&
-	    (!xmlStrEqual(BAD_CAST puri->scheme, BAD_CAST "file")))
-	    is_file_uri = 0;
+           (!xmlStrEqual(BAD_CAST puri->scheme, BAD_CAST "file")))
+           is_file_uri = 0;

-	/*
-	 * try to limit the damages of the URI unescaping code.
-	 */
-	if ((puri->scheme == NULL) ||
-	    (xmlStrEqual(BAD_CAST puri->scheme, BAD_CAST "file"))) {
-	    unescaped = xmlURIUnescapeString(URI, 0, NULL);
-            if (unescaped == NULL) {
-                xmlFreeURI(puri);
-                return(NULL);
-            }
+        /*
+         * try to limit the damages of the URI unescaping code.
+         */
+        if (puri->scheme == NULL) {
+            unescaped = xmlURIUnescapeString(URI, 0, NULL);
+             if (unescaped == NULL) {
+                 xmlFreeURI(puri);
+                 return(NULL);
+             }
+             URI = unescaped;
        }
-	xmlFreeURI(puri);
+        xmlFreeURI(puri);
    }

    /*
     * Allocate the Output buffer front-end.
     */
    ret = xmlAllocOutputBufferInternal(encoder);
-    if (ret == NULL)
+    if (ret == NULL) {
+        xmlFree(unescaped);
        return(NULL);
+    }

    /*
     * Try to find one of the output accept method accepting that scheme
@ -1995,26 +1976,10 @@ __xmlOutputBufferCreateFilename(const char *URI,
        int code;

        if (cb->matchcallback == xmlIODefaultMatch) {
-            if (unescaped != NULL) {
-                code = xmlOutputDefaultOpen(ret, unescaped, compression,
-                                            is_file_uri);
-                /* TODO: Handle other errors */
-                if (code == XML_ERR_OK)
-                    break;
-            }
-
            code = xmlOutputDefaultOpen(ret, URI, compression, is_file_uri);
            /* TODO: Handle other errors */
            if (code == XML_ERR_OK)
                break;
-        } else if ((cb->matchcallback != NULL) &&
-                   (cb->matchcallback(unescaped) != 0)) {
-            ret->context = cb->opencallback(unescaped);
-            if (ret->context != NULL) {
-                ret->writecallback = cb->writecallback;
-                ret->closecallback = cb->closecallback;
-                break;
-            }
        } else if ((cb->matchcallback != NULL) &&
                   (cb->matchcallback(URI) != 0)) {
            ret->context = cb->opencallback(URI);
@ -2026,13 +1991,12 @@ __xmlOutputBufferCreateFilename(const char *URI,
        }
    }

-    xmlFree(unescaped);
-
    if (ret->context == NULL) {
        xmlOutputBufferClose(ret);
-	return(NULL);
+	ret = NULL;
    }

+    xmlFree(unescaped);
    return(ret);
 }

@ -3191,28 +3155,23 @@ xmlCheckHTTPInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr ret) {
    return(ret);
 }

-static int xmlNoNetExists(const char *URL) {
-    const char *path;
+static int xmlNoNetExists(const char *filename) {
+    char *fromUri;
+    int ret;

-    if (URL == NULL)
+    if (filename == NULL)
 	return(0);

-    if (!xmlStrncasecmp(BAD_CAST URL, BAD_CAST "file://localhost/", 17))
-#if defined (_WIN32)
-	path = &URL[17];
-#else
-	path = &URL[16];
-#endif
-    else if (!xmlStrncasecmp(BAD_CAST URL, BAD_CAST "file:///", 8)) {
-#if defined (_WIN32)
-	path = &URL[8];
-#else
-	path = &URL[7];
-#endif
-    } else
-	path = URL;
+    if (xmlConvertUriToPath(filename, &fromUri) < 0)
+        return(0);

-    return xmlCheckFilename(path);
+    if (fromUri != NULL)
+        filename = fromUri;
+
+    ret =  xmlCheckFilename(filename);
+
+    xmlFree(fromUri);
+    return(ret);
 }

 #ifdef LIBXML_CATALOG_ENABLED
@ -3374,21 +3333,21 @@ xmlGetExternalEntityLoader(void) {
 xmlParserInputPtr
 xmlLoadExternalEntity(const char *URL, const char *ID,
                      xmlParserCtxtPtr ctxt) {
-    if ((URL != NULL) && (xmlNoNetExists(URL) == 0)) {
-	char *canonicFilename;
-	xmlParserInputPtr ret;
+    char *canonicFilename;
+    xmlParserInputPtr ret;

-	canonicFilename = (char *) xmlCanonicPath((const xmlChar *) URL);
-	if (canonicFilename == NULL) {
-            xmlCtxtErrMemory(ctxt);
-	    return(NULL);
-	}
+    if (URL == NULL)
+        return(NULL);

-	ret = xmlCurrentExternalEntityLoader(canonicFilename, ID, ctxt);
-	xmlFree(canonicFilename);
-	return(ret);
+    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) URL);
+    if (canonicFilename == NULL) {
+        xmlCtxtErrMemory(ctxt);
+        return(NULL);
    }
-    return(xmlCurrentExternalEntityLoader(URL, ID, ctxt));
+
+    ret = xmlCurrentExternalEntityLoader(canonicFilename, ID, ctxt);
+    xmlFree(canonicFilename);
+    return(ret);
 }

 /************************************************************************