# The following comment should be removed at some point in the future.
# mypy: disallow-untyped-defs=False

from __future__ import absolute_import

import cgi
import logging
import mimetypes
import os
import re
import shutil
import sys

from pip._vendor import requests
from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
from pip._vendor.six import PY2
from pip._vendor.six.moves.urllib import parse as urllib_parse

from pip._internal.exceptions import HashMismatch, InstallationError
from pip._internal.models.index import PyPI
from pip._internal.network.session import PipSession
from pip._internal.utils.encoding import auto_decode
from pip._internal.utils.filesystem import copy2_fixed
from pip._internal.utils.misc import (
    ask_path_exists,
    backup_dir,
    consume,
    display_path,
    format_size,
    hide_url,
    path_to_display,
    rmtree,
    splitext,
)
from pip._internal.utils.temp_dir import TempDirectory
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
from pip._internal.utils.ui import DownloadProgressProvider
from pip._internal.utils.unpacking import unpack_file
from pip._internal.utils.urls import get_url_scheme
from pip._internal.vcs import vcs

if MYPY_CHECK_RUNNING:
    from typing import (
        IO, Callable, List, Optional, Text, Tuple,
    )

    from mypy_extensions import TypedDict

    from pip._internal.models.link import Link
    from pip._internal.utils.hashes import Hashes
    from pip._internal.vcs.versioncontrol import VersionControl

    if PY2:
        CopytreeKwargs = TypedDict(
            'CopytreeKwargs',
            {
                'ignore': Callable[[str, List[str]], List[str]],
                'symlinks': bool,
            },
            total=False,
        )
    else:
        CopytreeKwargs = TypedDict(
            'CopytreeKwargs',
            {
                'copy_function': Callable[[str, str], None],
                'ignore': Callable[[str, List[str]], List[str]],
                'ignore_dangling_symlinks': bool,
                'symlinks': bool,
            },
            total=False,
        )

__all__ = ['get_file_content',
           'unpack_vcs_link',
           'unpack_file_url',
           'unpack_http_url', 'unpack_url',
           'parse_content_disposition', 'sanitize_content_filename']


logger = logging.getLogger(__name__)


def get_file_content(url, comes_from=None, session=None):
    # type: (str, Optional[str], Optional[PipSession]) -> Tuple[str, Text]
    """Gets the content of a file; it may be a filename, file: URL, or
    http: URL. Returns (location, content). Content is unicode.

    :param url: File path or url.
    :param comes_from: Origin description of requirements.
    :param session: Instance of pip.download.PipSession.
    """
    if session is None:
        raise TypeError(
            "get_file_content() missing 1 required keyword argument: 'session'"
        )

    scheme = get_url_scheme(url)

    if scheme in ['http', 'https']:
        # FIXME: catch some errors
        resp = session.get(url)
        resp.raise_for_status()
        return resp.url, resp.text

    elif scheme == 'file':
        if comes_from and comes_from.startswith('http'):
            raise InstallationError(
                'Requirements file %s references URL %s, which is local'
                % (comes_from, url))

        path = url.split(':', 1)[1]
        path = path.replace('\\', '/')
        match = _url_slash_drive_re.match(path)
        if match:
            path = match.group(1) + ':' + path.split('|', 1)[1]
        path = urllib_parse.unquote(path)
        if path.startswith('/'):
            path = '/' + path.lstrip('/')
        url = path

    try:
        with open(url, 'rb') as f:
            content = auto_decode(f.read())
    except IOError as exc:
        raise InstallationError(
            'Could not open requirements file: %s' % str(exc)
        )
    return url, content


_url_slash_drive_re = re.compile(r'/*([a-z])\|', re.I)
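
# Illustrative example (hypothetical path): the regex above recognizes the
# legacy Windows "pipe" drive notation in file: URLs handled by
# get_file_content(). A requirements reference such as
#     file:///c|/projects/requirements.txt
# is split to the path '///c|/projects/requirements.txt', matched by
# _url_slash_drive_re, and rewritten to 'c:/projects/requirements.txt'
# before the file is opened.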


def unpack_vcs_link(link, location):
    # type: (Link, str) -> None
    vcs_backend = _get_used_vcs_backend(link)
    assert vcs_backend is not None
    vcs_backend.unpack(location, url=hide_url(link.url))


def _get_used_vcs_backend(link):
    # type: (Link) -> Optional[VersionControl]
    """
    Return a VersionControl object or None.
    """
    for vcs_backend in vcs.backends:
        if link.scheme in vcs_backend.schemes:
            return vcs_backend
    return None
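
# Illustrative example (hypothetical URL): a link such as
# "git+https://github.com/pypa/pip.git" has the scheme "git+https", which the
# Git backend lists among its schemes, so _get_used_vcs_backend() returns that
# backend; a plain "https" link matches no backend and yields None.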


def _progress_indicator(iterable, *args, **kwargs):
    return iterable


def _download_url(
    resp,  # type: Response
    link,  # type: Link
    content_file,  # type: IO
    hashes,  # type: Optional[Hashes]
    progress_bar  # type: str
):
    # type: (...) -> None
    try:
        total_length = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        total_length = 0

    cached_resp = getattr(resp, "from_cache", False)
    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif cached_resp:
        show_progress = False
    elif total_length > (40 * 1000):
        show_progress = True
    elif not total_length:
        show_progress = True
    else:
        show_progress = False

    show_url = link.show_url

    def resp_read(chunk_size):
        try:
            # Special case for urllib3.
            for chunk in resp.raw.stream(
                    chunk_size,
                    # We use decode_content=False here because we don't
                    # want urllib3 to mess with the raw bytes we get
                    # from the server. If we decompress inside of
                    # urllib3 then we cannot verify the checksum
                    # because the checksum will be of the compressed
                    # file. This breakage will only occur if the
                    # server adds a Content-Encoding header, which
                    # depends on how the server was configured:
                    # - Some servers will notice that the file isn't a
                    #   compressible file and will leave the file alone
                    #   and with an empty Content-Encoding
                    # - Some servers will notice that the file is
                    #   already compressed and will leave the file
                    #   alone and will add a Content-Encoding: gzip
                    #   header
                    # - Some servers won't notice anything at all and
                    #   will take a file that's already been compressed
                    #   and compress it again and set the
                    #   Content-Encoding: gzip header
                    #
                    # By setting this not to decode automatically we
                    # hope to eliminate problems with the second case.
                    decode_content=False):
                yield chunk
        except AttributeError:
            # Standard file-like object.
            while True:
                chunk = resp.raw.read(chunk_size)
                if not chunk:
                    break
                yield chunk

    def written_chunks(chunks):
        for chunk in chunks:
            content_file.write(chunk)
            yield chunk

    progress_indicator = _progress_indicator

    if link.netloc == PyPI.netloc:
        url = show_url
    else:
        url = link.url_without_fragment

    if show_progress:  # We don't show progress on cached responses
        progress_indicator = DownloadProgressProvider(progress_bar,
                                                      max=total_length)
        if total_length:
            logger.info("Downloading %s (%s)", url, format_size(total_length))
        else:
            logger.info("Downloading %s", url)
    elif cached_resp:
        logger.info("Using cached %s", url)
    else:
        logger.info("Downloading %s", url)

    downloaded_chunks = written_chunks(
        progress_indicator(
            resp_read(CONTENT_CHUNK_SIZE),
            CONTENT_CHUNK_SIZE
        )
    )
    if hashes:
        hashes.check_against_chunks(downloaded_chunks)
    else:
        consume(downloaded_chunks)
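
# Note on the streaming flow in _download_url(): resp_read() yields raw,
# undecoded chunks from the response, written_chunks() writes each chunk to
# content_file as it passes through, and the optional progress indicator wraps
# the stream for display. The final consumer, hashes.check_against_chunks() or
# consume(), is what actually pulls chunks through the generators, so the
# network read, the write to disk and the hash check happen in a single pass.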


def _copy_file(filename, location, link):
    copy = True
    download_location = os.path.join(location, link.filename)
    if os.path.exists(download_location):
        response = ask_path_exists(
            'The file %s exists. (i)gnore, (w)ipe, (b)ackup, (a)bort' %
            display_path(download_location), ('i', 'w', 'b', 'a'))
        if response == 'i':
            copy = False
        elif response == 'w':
            logger.warning('Deleting %s', display_path(download_location))
            os.remove(download_location)
        elif response == 'b':
            dest_file = backup_dir(download_location)
            logger.warning(
                'Backing up %s to %s',
                display_path(download_location),
                display_path(dest_file),
            )
            shutil.move(download_location, dest_file)
        elif response == 'a':
            sys.exit(-1)
    if copy:
        shutil.copy(filename, download_location)
        logger.info('Saved %s', display_path(download_location))


def unpack_http_url(
    link,  # type: Link
    location,  # type: str
    download_dir=None,  # type: Optional[str]
    session=None,  # type: Optional[PipSession]
    hashes=None,  # type: Optional[Hashes]
    progress_bar="on"  # type: str
):
    # type: (...) -> None
    if session is None:
        raise TypeError(
            "unpack_http_url() missing 1 required keyword argument: 'session'"
        )

    with TempDirectory(kind="unpack") as temp_dir:
        # If a download dir is specified, is the file already downloaded there?
        already_downloaded_path = None
        if download_dir:
            already_downloaded_path = _check_download_dir(link,
                                                          download_dir,
                                                          hashes)

        if already_downloaded_path:
            from_path = already_downloaded_path
            content_type = mimetypes.guess_type(from_path)[0]
        else:
            # let's download to a tmp dir
            from_path, content_type = _download_http_url(link,
                                                         session,
                                                         temp_dir.path,
                                                         hashes,
                                                         progress_bar)

        # unpack the archive to the build dir location. even when only
        # downloading archives, they have to be unpacked to parse dependencies
        unpack_file(from_path, location, content_type)

        # a download dir is specified; let's copy the archive there
        if download_dir and not already_downloaded_path:
            _copy_file(from_path, download_dir, link)

        if not already_downloaded_path:
            os.unlink(from_path)


def _copy2_ignoring_special_files(src, dest):
    # type: (str, str) -> None
    """Copying special files is not supported, but as a convenience to users
    we skip errors copying them. This supports tools that may create e.g.
    socket files in the project source directory.
    """
    try:
        copy2_fixed(src, dest)
    except shutil.SpecialFileError as e:
        # SpecialFileError may be raised due to either the source or
        # destination. If the destination was the cause then we would actually
        # care, but since the destination directory is deleted prior to
        # copy we ignore all of them assuming it is caused by the source.
        logger.warning(
            "Ignoring special file error '%s' encountered copying %s to %s.",
            str(e),
            path_to_display(src),
            path_to_display(dest),
        )


def _copy_source_tree(source, target):
    # type: (str, str) -> None
    def ignore(d, names):
        # Pulling in those directories can potentially be very slow,
        # exclude the following directories if they appear in the top
        # level dir (and only it).
        # See discussion at https://github.com/pypa/pip/pull/6770
        return ['.tox', '.nox'] if d == source else []

    kwargs = dict(ignore=ignore, symlinks=True)  # type: CopytreeKwargs

    if not PY2:
        # Python 2 does not support copy_function, so we only ignore
        # errors on special file copy in Python 3.
        kwargs['copy_function'] = _copy2_ignoring_special_files

    shutil.copytree(source, target, **kwargs)
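
# Sketch of the effect of _copy_source_tree() (hypothetical layout):
#     project/.tox/        -> skipped (excluded at the top level only)
#     project/docs/.tox/   -> copied  (nested .tox is not excluded)
#     project/setup.py     -> copied
# because ignore() returns ['.tox', '.nox'] only when copytree visits the
# top-level directory, i.e. when d == source.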


def unpack_file_url(
    link,  # type: Link
    location,  # type: str
    download_dir=None,  # type: Optional[str]
    hashes=None  # type: Optional[Hashes]
):
    # type: (...) -> None
    """Unpack link into location.

    If download_dir is provided and link points to a file, make a copy
    of the link file inside download_dir.
    """
    link_path = link.file_path
    # If it's a url to a local directory
    if link.is_existing_dir():
        if os.path.isdir(location):
            rmtree(location)
        _copy_source_tree(link_path, location)
        if download_dir:
            logger.info('Link is a directory, ignoring download_dir')
        return

    # If --require-hashes is off, `hashes` is either empty, the
    # link's embedded hash, or MissingHashes; it is required to
    # match. If --require-hashes is on, we are satisfied by any
    # hash in `hashes` matching: a URL-based or an option-based
    # one; no internet-sourced hash will be in `hashes`.
    if hashes:
        hashes.check_against_path(link_path)

    # If a download dir is specified, is the file already there and valid?
    already_downloaded_path = None
    if download_dir:
        already_downloaded_path = _check_download_dir(link,
                                                      download_dir,
                                                      hashes)

    if already_downloaded_path:
        from_path = already_downloaded_path
    else:
        from_path = link_path

    content_type = mimetypes.guess_type(from_path)[0]

    # unpack the archive to the build dir location. even when only downloading
    # archives, they have to be unpacked to parse dependencies
    unpack_file(from_path, location, content_type)

    # a download dir is specified and not already downloaded
    if download_dir and not already_downloaded_path:
        _copy_file(from_path, download_dir, link)


def unpack_url(
    link,  # type: Link
    location,  # type: str
    download_dir=None,  # type: Optional[str]
    session=None,  # type: Optional[PipSession]
    hashes=None,  # type: Optional[Hashes]
    progress_bar="on"  # type: str
):
    # type: (...) -> None
    """Unpack link.

    If link is a VCS link:
      if only_download, export into download_dir and ignore location
      else unpack into location
    for other types of link:
      - unpack into location
      - if download_dir, copy the file into download_dir
      - if only_download, mark location for deletion

    :param hashes: A Hashes object, one of whose embedded hashes must match,
        or HashMismatch will be raised. If the Hashes is empty, no matches are
        required, and unhashable types of requirements (like VCS ones, which
        would ordinarily raise HashUnsupported) are allowed.
    """
    # non-editable vcs urls
    if link.is_vcs:
        unpack_vcs_link(link, location)

    # file urls
    elif link.is_file:
        unpack_file_url(link, location, download_dir, hashes=hashes)

    # http urls
    else:
        if session is None:
            session = PipSession()

        unpack_http_url(
            link,
            location,
            download_dir,
            session,
            hashes=hashes,
            progress_bar=progress_bar
        )


def sanitize_content_filename(filename):
    # type: (str) -> str
    """
    Sanitize the "filename" value from a Content-Disposition header.
    """
    return os.path.basename(filename)
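
# Minimal illustration (hypothetical values): only the basename survives, so a
# hostile Content-Disposition filename cannot escape the download directory:
#     sanitize_content_filename('../../etc/passwd')  ->  'passwd'
#     sanitize_content_filename('pip-20.0.tar.gz')   ->  'pip-20.0.tar.gz'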


def parse_content_disposition(content_disposition, default_filename):
    # type: (str, str) -> str
    """
    Parse the "filename" value from a Content-Disposition header, and
    return the default filename if the result is empty.
    """
    _type, params = cgi.parse_header(content_disposition)
    filename = params.get('filename')
    if filename:
        # We need to sanitize the filename to prevent directory traversal
        # in case the filename contains ".." path parts.
        filename = sanitize_content_filename(filename)
    return filename or default_filename
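
# Minimal illustration (hypothetical header values):
#     parse_content_disposition(
#         'attachment; filename="pip-20.0.tar.gz"', 'fallback.tar.gz')
#     ->  'pip-20.0.tar.gz'
#     parse_content_disposition('attachment', 'fallback.tar.gz')
#     ->  'fallback.tar.gz'   (no filename parameter, so the default is used)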


def _download_http_url(
    link,  # type: Link
    session,  # type: PipSession
    temp_dir,  # type: str
    hashes,  # type: Optional[Hashes]
    progress_bar  # type: str
):
    # type: (...) -> Tuple[str, str]
    """Download link url into temp_dir using provided session"""
    target_url = link.url.split('#', 1)[0]
    try:
        resp = session.get(
            target_url,
            # We use Accept-Encoding: identity here because requests
            # defaults to accepting compressed responses. This breaks in
            # a variety of ways depending on how the server is configured.
            # - Some servers will notice that the file isn't a compressible
            #   file and will leave the file alone and with an empty
            #   Content-Encoding
            # - Some servers will notice that the file is already
            #   compressed and will leave the file alone and will add a
            #   Content-Encoding: gzip header
            # - Some servers won't notice anything at all and will take
            #   a file that's already been compressed and compress it again
            #   and set the Content-Encoding: gzip header
            # By setting this to request only the identity encoding we're
            # hoping to eliminate the third case. Hopefully there does not
            # exist a server which when given a file will notice it is
            # already compressed and that you're not asking for a
            # compressed file and will then decompress it before sending
            # because if that's the case I don't think it'll ever be
            # possible to make this work.
            headers={"Accept-Encoding": "identity"},
            stream=True,
        )
        resp.raise_for_status()
    except requests.HTTPError as exc:
        logger.critical(
            "HTTP error %s while getting %s", exc.response.status_code, link,
        )
        raise

    content_type = resp.headers.get('content-type', '')
    filename = link.filename  # fallback
    # Have a look at the Content-Disposition header for a better guess
    content_disposition = resp.headers.get('content-disposition')
    if content_disposition:
        filename = parse_content_disposition(content_disposition, filename)
    ext = splitext(filename)[1]  # type: Optional[str]
    if not ext:
        ext = mimetypes.guess_extension(content_type)
        if ext:
            filename += ext
    if not ext and link.url != resp.url:
        ext = os.path.splitext(resp.url)[1]
        if ext:
            filename += ext
    file_path = os.path.join(temp_dir, filename)
    with open(file_path, 'wb') as content_file:
        _download_url(resp, link, content_file, hashes, progress_bar)
    return file_path, content_type


def _check_download_dir(link, download_dir, hashes):
    # type: (Link, str, Optional[Hashes]) -> Optional[str]
    """Check download_dir for previously downloaded file with correct hash.

    If a correct file is found return its path, else None.
    """
    download_path = os.path.join(download_dir, link.filename)

    if not os.path.exists(download_path):
        return None

    # If already downloaded, does its hash match?
    logger.info('File was already downloaded %s', download_path)
    if hashes:
        try:
            hashes.check_against_path(download_path)
        except HashMismatch:
            logger.warning(
                'Previously-downloaded file %s has bad hash. '
                'Re-downloading.',
                download_path
            )
            os.unlink(download_path)
            return None
    return download_path