Improve error handling for tile downloads; add --download-loops argument to repeat the download pass

This commit is contained in:
Cyberes 2023-11-11 10:55:43 -07:00
parent 1006fc7d49
commit 1bbbc339a8
2 changed files with 56 additions and 37 deletions

View File

@ -29,8 +29,13 @@ if __name__ == '__main__':
parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff') parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff')
parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)') parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.") parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.")
parser.add_argument('--download-loops', default=1, type=int, help='Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1')
args = parser.parse_args() args = parser.parse_args()
if args.download_loops <= 0:
print('--download-loops must be greater than 0')
quit(1)
args.base_url = args.base_url.strip('/') + f'/{args.zoom}/' args.base_url = args.base_url.strip('/') + f'/{args.zoom}/'
base_output = Path(args.output).resolve().absolute().expanduser() base_output = Path(args.output).resolve().absolute().expanduser()
url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==') url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==')
@ -54,39 +59,45 @@ if __name__ == '__main__':
if args.referer: if args.referer:
r_headers['Referer'] = args.referer r_headers['Referer'] = args.referer
tiles = [] row_bar = tqdm(total=0, desc='Row 000 | Loop 0/0', postfix={'new_files': 0, 'failures': 0})
retries = [] for i in range(1, args.download_loops + 1):
total_downloaded = 0 row_bar.reset()
row_i = min_row tiles = []
row_iter = range(min_row, max_row + 1) retries = []
row_bar = tqdm(total=len(row_iter), desc=f'Row {row_i}', postfix={'new_files': total_downloaded, 'failures': len(retries)}) total_downloaded = 0
for row in row_iter: row_i = min_row
row_i = row row_iter = range(min_row, max_row + 1)
col_iter = range(min_col, max_col + 1) row_bar.total = len(row_iter)
if args.no_download: row_bar.desc = f'Row {row_i} | Loop {i}/{args.download_loops}'
for col in col_iter: row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
tiles.append((row, col)) for row in row_iter:
else: row_i = row
col_bar = tqdm(total=len(col_iter), leave=False) col_iter = range(min_col, max_col + 1)
with (ThreadPoolExecutor(args.dl_threads) as executor): if args.no_download:
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter] for col in col_iter:
for future in as_completed(futures): tiles.append((row, col))
result = future.result() else:
if result: col_bar = tqdm(total=len(col_iter), leave=False)
result_row, result_col, new_image = result with (ThreadPoolExecutor(args.dl_threads) as executor):
if new_image == 'success': futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter]
total_downloaded += 1 for future in as_completed(futures):
tiles.append((result_row, result_col)) result = future.result()
elif new_image == 'exist': if result:
tiles.append((result_row, result_col)) result_row, result_col, new_image = result
elif new_image == 'failure': if new_image == 'success':
retries.append((result_row, result_col)) total_downloaded += 1
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) tiles.append((result_row, result_col))
col_bar.update() elif new_image == 'exist':
row_bar.refresh() tiles.append((result_row, result_col))
col_bar.close() elif new_image == 'failure':
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)}) retries.append((result_row, result_col))
row_bar.update() row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
col_bar.update()
row_bar.refresh()
col_bar.close()
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
row_bar.update()
row_bar.close() row_bar.close()
col_bar = tqdm(total=len(retries), desc=f'Tile Retries') col_bar = tqdm(total=len(retries), desc=f'Tile Retries')

View File

@ -17,13 +17,15 @@ def del_path(p: Path):
def download_tile(task): def download_tile(task):
row, col, base_url, r_headers, output, use_proxy = task row, col, base_url, r_headers, output, use_proxy = task
corrupted_image = False
try: try:
output_path: Path = output / f"{row}_{col}.png" output_path: Path = output / f"{row}_{col}.png"
if output_path.exists(): if output_path.exists():
if not is_png(output_path): if not is_png(output_path):
# Delete the file and try again. # We will re-download the image. Don't need to delete it, just overwrite it.
del_path(output_path) # del_path(output_path)
tqdm.write(f'cannot identify image file: "{output_path}", deleting and retrying...') corrupted_image = True
tqdm.write(f'Cannot identify image file: "{output_path}", deleting and retrying...')
else: else:
return row, col, 'exist' return row, col, 'exist'
tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://') tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://')
@ -33,9 +35,15 @@ def download_tile(task):
raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}') raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}')
with open(output_path, "wb") as f: with open(output_path, "wb") as f:
f.write(response.content) f.write(response.content)
return row, col, 'success' # Recheck the PNG if it was corrupted.
if corrupted_image and not is_png(output_path):
print(f"Retry for {row}_{col} failed a second time: cannot identify image file")
return row, col, 'failure'
else:
return row, col, 'success'
else: else:
print(f"Failed to download tile {row}_{col}") print(f"Failed to download tile {row}_{col}")
return row, col, 'failure'
except Exception as e: except Exception as e:
# traceback.print_exc() # traceback.print_exc()
tqdm.write(f'Exception on {(row, col)} - {e.__class__.__name__}: {e}') tqdm.write(f'Exception on {(row, col)} - {e.__class__.__name__}: {e}')