From 1bbbc339a81d8d175d18fa96e45bd3303ef69408 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Sat, 11 Nov 2023 10:55:43 -0700 Subject: [PATCH] better error handling for tile downloads, add download retry arg --- exfiltrate.py | 77 +++++++++++++++++++++++++++++---------------------- pkg/thread.py | 16 ++++++++--- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/exfiltrate.py b/exfiltrate.py index 62a4ced..a1ca3b1 100644 --- a/exfiltrate.py +++ b/exfiltrate.py @@ -29,8 +29,13 @@ if __name__ == '__main__': parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff') parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)') parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.") + parser.add_argument('--download-loops', default=1, type=int, help='Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1') args = parser.parse_args() + if args.download_loops <= 0: + print('--download-loops must be greater than 0') + quit(1) + args.base_url = args.base_url.strip('/') + f'/{args.zoom}/' base_output = Path(args.output).resolve().absolute().expanduser() url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==') @@ -54,39 +59,45 @@ if __name__ == '__main__': if args.referer: r_headers['Referer'] = args.referer - tiles = [] - retries = [] - total_downloaded = 0 - row_i = min_row - row_iter = range(min_row, max_row + 1) - row_bar = tqdm(total=len(row_iter), desc=f'Row {row_i}', postfix={'new_files': total_downloaded, 'failures': len(retries)}) - for row in row_iter: - row_i = row - col_iter = range(min_col, max_col + 1) - if args.no_download: - for col in col_iter: - tiles.append((row, col)) - else: - col_bar = tqdm(total=len(col_iter), leave=False) - with (ThreadPoolExecutor(args.dl_threads) as executor): - futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter] - for future in as_completed(futures): - result = future.result() - if result: - result_row, result_col, new_image = result - if new_image == 'success': - total_downloaded += 1 - tiles.append((result_row, result_col)) - elif new_image == 'exist': - tiles.append((result_row, result_col)) - elif new_image == 'failure': - retries.append((result_row, result_col)) - row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) - col_bar.update() - row_bar.refresh() - col_bar.close() - row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) - row_bar.update() + row_bar = tqdm(total=0, desc='Row 000 | Loop 0/0', postfix={'new_files': 0, 'failures': 0}) + for i in range(1, args.download_loops + 1): + row_bar.reset() + tiles = [] + retries = [] + total_downloaded = 0 + row_i = min_row + row_iter = range(min_row, max_row + 1) + row_bar.total = len(row_iter) + row_bar.desc = f'Row {row_i} | Loop {i}/{args.download_loops}' + row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) + for row in row_iter: + row_i = row + col_iter = range(min_col, max_col + 1) + if args.no_download: + for col in col_iter: + tiles.append((row, col)) + else: + col_bar = tqdm(total=len(col_iter), leave=False) + with (ThreadPoolExecutor(args.dl_threads) as executor): + futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter] + for future in as_completed(futures): + result = future.result() + if result: + result_row, result_col, new_image = result + if new_image == 'success': + total_downloaded += 1 + tiles.append((result_row, result_col)) + elif new_image == 'exist': + tiles.append((result_row, result_col)) + elif new_image == 'failure': + retries.append((result_row, result_col)) + row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) + col_bar.update() + row_bar.refresh() + col_bar.close() + row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) + row_bar.update() + row_bar.close() col_bar = tqdm(total=len(retries), desc=f'Tile Retries') diff --git a/pkg/thread.py b/pkg/thread.py index 23560c3..76468bf 100644 --- a/pkg/thread.py +++ b/pkg/thread.py @@ -17,13 +17,15 @@ def del_path(p: Path): def download_tile(task): row, col, base_url, r_headers, output, use_proxy = task + corrupted_image = False try: output_path: Path = output / f"{row}_{col}.png" if output_path.exists(): if not is_png(output_path): - # Delete the file and try again. - del_path(output_path) - tqdm.write(f'cannot identify image file: "{output_path}", deleting and retrying...') + # We will re-download the image. Don't need to delete it, just overwrite it. + # del_path(output_path) + corrupted_image = True + tqdm.write(f'Cannot identify image file: "{output_path}", deleting and retrying...') else: return row, col, 'exist' tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://') @@ -33,9 +35,15 @@ def download_tile(task): raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}') with open(output_path, "wb") as f: f.write(response.content) - return row, col, 'success' + # Recheck the PNG if it was corrupted. + if corrupted_image and not is_png(output_path): + print(f"Retry for {row}_{col} failed a second time: cannot identify image file") + return row, col, 'failure' + else: + return row, col, 'success' else: print(f"Failed to download tile {row}_{col}") + return row, col, 'failure' except Exception as e: # traceback.print_exc() tqdm.write(f'Exception on {(row, col)} - {e.__class__.__name__}: {e}')