xCodeEval Not Found ...validation/C%2523.jsonl的原因与解决
原blog地址: poilzero.cn/index.php/archives/335/
作者: p0iL
原因
URL中的C%2523.jsonl实际上是C#.jsonl的双重编码,因为%25是%的URL编码,所以%2523解码后是%23,而%23又是#的编码。因此,正确的文件名应该是C#.jsonl,但程序错误地将其编码了两次,导致路径错误。
这是怎么造成的呢?问题出在 双重 URL 编码,具体来说是在生成文件 URL 时手动调用了 urllib.parse.quote,而 Hugging Face 的 datasets 库在下载文件时会自动对路径进行 URL 编码。这导致文件名中的 # 被错误编码为 %2523(正确应为 %23)。
问题定位
xCodeEval数据集是在22年上传的,但处理数据集的脚本代码xCodeEval.py在24年更新过一次,正是这次更新引入了该问题。
错误代码位于xCodeEval.py->def _split_generators(self, dl_manager):
中的错误逻辑。
# In the snippet below, urllib.parse.quote is called manually, causing double URL encoding:
train_urls = [
    BASE_URL.format(
        task_name=task_name,
        split="train",
        file_name=urllib.parse.quote(file_name),  # redundant: dl_manager encodes the URL again
    )
    for file_name in TRAIN_FILE_NAMES
]
解决方案
更正后的代码如下:
def _split_generators(self, dl_manager):
    """Download the per-task data files and build the split generators.

    Fix: file names are passed into ``BASE_URL`` *without* calling
    ``urllib.parse.quote``. The Hugging Face ``dl_manager`` URL-encodes the
    path itself, so quoting here double-encodes special characters
    (e.g. ``#`` in ``C#.jsonl`` became ``%2523`` instead of ``%23``,
    producing 404s).
    """
    task_name = self.config.name

    def _build_urls(split, file_names):
        # One place for the URL template; do NOT quote file_name here
        # (see docstring — dl_manager performs the encoding).
        urls = [
            BASE_URL.format(task_name=task_name, split=split, file_name=file_name)
            for file_name in file_names
        ]
        if split == "":
            # An empty split leaves a doubled slash in the template; collapse it.
            urls = [url.replace("s//", "s/") for url in urls]
        return urls

    if task_name == "retrieval_corpus":
        # retrieval_corpus has a single TEST split and no split directory.
        test_downloaded_files = dl_manager.download(
            _build_urls("", get_file_name(task_name, ""))
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepaths": test_downloaded_files,
                },
            ),
        ]

    train_downloaded_files = dl_manager.download(
        _build_urls("train", get_file_name(task_name, "train"))
    )
    validation_downloaded_files = dl_manager.download(
        _build_urls("validation", get_file_name(task_name, "validation"))
    )
    test_downloaded_files = dl_manager.download(
        _build_urls("test", get_file_name(task_name, "test"))
    )

    if task_name == "code_translation":
        # Extra small validation split only for code_translation.
        validation_small_downloaded_files = dl_manager.download(
            _build_urls("validation_small", get_file_name(task_name, "validation_small"))
        )

    # Shared auxiliary files, fetched only for the tasks that need them.
    prob_desc_file, unit_test_db_file = None, None
    if task_name in _PROBLEM_DESC_REQ_TASK:
        prob_desc_file = dl_manager.download(PROBLEM_DESC_URL)
    if task_name in _UNIT_TEST_REQ_TASK:
        unit_test_db_file = dl_manager.download(UNIT_TEST_DB_URL)

    split_info = [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "filepaths": train_downloaded_files,
                "problem_description_file": prob_desc_file,
            },
        ),
        datasets.SplitGenerator(
            name='compact',
            gen_kwargs={
                "filepaths": validation_downloaded_files,
                "problem_description_file": prob_desc_file,
                "unit_test_db_file": unit_test_db_file,
            },
        ),
        datasets.SplitGenerator(
            name='titan',
            gen_kwargs={
                "filepaths": test_downloaded_files,
                "problem_description_file": prob_desc_file,
                "unit_test_db_file": unit_test_db_file,
            },
        ),
    ]
    if task_name == "code_translation":
        split_info.append(
            datasets.SplitGenerator(
                name="compact_small",
                gen_kwargs={
                    "filepaths": validation_small_downloaded_files,
                    "problem_description_file": prob_desc_file,
                    "unit_test_db_file": unit_test_db_file,
                },
            ),
        )
    return split_info
split 编码错误
问题及定位
读取数据文件时未显式指定编码,Windows 中文环境默认使用 GBK 解码,遇到文件中的特殊字符即报错;文件本身的特殊字符起源同上。
File "...cache\huggingface\modules\datasets_modules\datasets\NTU-NLP-sg--xCodeEval\09d865c42c6ae48662c010d4be4fbe4322b30a51b96c25b1c13d6bc479a03d0e\xCodeEval.py", line 2483, in _generate_examples
for line in rp:
^^
UnicodeDecodeError: 'gbk' codec can't decode byte 0x94 in position 704: illegal multibyte sequence
解决方案
错误代码位于xCodeEval.py-> def _generate_examples(self, filepaths, problem_description_file=None, unit_test_db_file=None):
中。
修改代码如下
def _generate_examples(
    self, filepaths, problem_description_file=None, unit_test_db_file=None
):
    """Yield ``(index, example)`` pairs from the downloaded jsonl files.

    All files are opened with ``encoding="utf-8"``: the dataset contains
    non-ASCII text, and relying on the platform default codec (e.g. GBK on
    Chinese-locale Windows) raises ``UnicodeDecodeError``.
    """
    import os  # stdlib; local import keeps the quoted snippet self-contained

    task_name = self.config.name

    # Optional src_uid -> problem-description map (jsonl, one object per line).
    problem_descriptions = None
    if problem_description_file is not None:
        problem_descriptions = {}
        with open(problem_description_file, encoding="utf-8") as rp:
            for line in rp:
                prob_desc = json.loads(line)
                src_uid = prob_desc["src_uid"]
                problem_descriptions[src_uid] = prob_desc

    # Optional src_uid -> hidden unit tests map (one JSON document).
    unit_test_db = None
    if unit_test_db_file is not None:
        with open(unit_test_db_file, encoding="utf-8") as rp:
            unit_test_db = json.load(rp)

    idx = 0
    for filepath in filepaths:
        # os.path.basename handles both "/" and "\\" separators;
        # split("/")[-1] left the whole path on Windows cache dirs.
        file_name = os.path.basename(filepath)
        with open(filepath, encoding="utf-8") as rp:
            for line in rp:
                sample = json.loads(line)
                sample["file_name"] = file_name
                # Drop features not declared for this task, then backfill
                # the declared ones that are missing with empty strings.
                for pre_feature in list(sample.keys()):
                    if pre_feature not in _TEXT_FEATURES[task_name]:
                        sample.pop(pre_feature)
                for feature in _TEXT_FEATURES[task_name]:
                    if feature not in sample:
                        sample[feature] = ""
                if task_name in _UNIT_TEST_REQ_TASK:
                    sample["hidden_unit_tests"] = ""
                if (
                    task_name not in _PROBLEM_DESC_REQ_TASK
                    or problem_descriptions is None
                ):
                    yield idx, sample  # if problem_description_file is None then unit_test_db_file should be None
                    idx += 1
                    continue
                src_uid = sample["src_uid"]
                # if problem_description_file is not None, the sample has `src_uid`
                prob_desc = problem_descriptions[src_uid]
                for key, ckey in _PROB_DESC_TEXT_FEATURES.items():
                    if (
                        ckey == "prob_desc_sample_inputs"
                        or ckey == "prob_desc_sample_outputs"
                    ):
                        # Sample I/O is stored re-serialized as a JSON string.
                        sample[ckey] = json.dumps(prob_desc[key])
                    else:
                        sample[ckey] = prob_desc[key]
                if task_name not in _UNIT_TEST_REQ_TASK or unit_test_db is None:
                    yield idx, sample
                    idx += 1
                    continue
                sample["hidden_unit_tests"] = json.dumps(unit_test_db[src_uid])
                yield idx, sample
                idx += 1