我正在尝试运行一个程序,该程序应该扫描“New folder”目录中的所有 pdf 文件,提取相关的字符串值,并在一个新的 CSV 文件中生成一个表。
代码如下
def check_rate(rating):
    """Map a raw rating string from an audit report to a standard label.

    Matching is case-insensitive and follows this precedence:

    1. The first two characters are a known risk code
       ("1H", "2H", "2M", "2L", "3L").
    2. Otherwise, the first of these phrases found anywhere in the text:
       "UNSAFE", "NEEDS RECTIFICATION", "IMPROVEMENTS IDENTIFIED",
       "ADEQUATE".

    Args:
        rating: The rating text as extracted from the report (a ``str``).

    Returns:
        One of "Unsafe", "Needs Rectification", "Improvements identified"
        or "Adequate"; ``None`` when nothing matches.
    """
    unsafe = "Unsafe"
    needs_rectification = "Needs Rectification"
    improvements = "Improvements identified"
    adequate = "Adequate"

    text = rating.upper()

    # Risk codes take priority over free-text phrases.
    label_by_code = {
        "1H": unsafe,
        "2H": needs_rectification,
        "2M": improvements,
        "2L": adequate,
        "3L": adequate,
    }
    code = text[:2]
    if code in label_by_code:
        return label_by_code[code]

    # Order matters: the first phrase found wins.
    label_by_phrase = (
        ("UNSAFE", unsafe),
        ("NEEDS RECTIFICATION", needs_rectification),
        ("IMPROVEMENTS IDENTIFIED", improvements),
        ("ADEQUATE", adequate),
    )
    for phrase, label in label_by_phrase:
        if phrase in text:
            return label

    return None
import glob
import pandas as pd
def extract_sv_id(path, start=13, first_end=22, separators=(" ", "-", ".")):
    """Extract the SV_ID embedded at a fixed position in a PDF path.

    The ID starts at index ``start`` and is terminated by the first
    separator found at index ``first_end``, ``first_end + 1`` or
    ``first_end + 2`` (checked in that order), i.e. it is 9, 10 or 11
    characters long with the defaults. The default ``start`` of 13 equals
    the length of the "./New folder/" prefix used in the glob pattern below.

    Args:
        path: File path as returned by ``glob``.
        start: Index of the first character of the ID.
        first_end: First index at which a separator may terminate the ID.
        separators: Characters that terminate the ID.

    Returns:
        The extracted ID, or "N/A" when no separator is found at any of the
        three positions (including when the path is too short to have them).
    """
    for end in range(first_end, first_end + 3):
        # Slicing (not indexing) so a short path yields "" instead of
        # raising IndexError; "" is never in `separators`.
        if path[end:end + 1] in separators:
            return path[start:end]
    return "N/A"


# Collect every PDF in the input folder; one row per file.
files = glob.glob("./New folder/*.pdf")
df_name = pd.DataFrame(files, columns=['FileName'])

# Number of PDF files; reused by the parsing loop further down.
y = len(df_name)

df_name["SV_ID"] = df_name["FileName"].map(extract_sv_id)

# Written out so the filename -> SV_ID mapping can be inspected.
df_name.to_csv('name2.csv')
from tika import parser

# Checklist references to search for, and the table the findings are
# written into. Both paths are relative to the current working directory.
df_ref = pd.read_csv('CheckListItems.csv')
df_Rate = pd.read_csv('TechSafe.csv')

# Fail early with an actionable message instead of a bare KeyError raised
# from deep inside the parsing loop.
if "Reference" not in df_ref.columns:
    raise KeyError(
        "'Reference' column not found in CheckListItems.csv "
        "(columns found: %s)" % list(df_ref.columns)
    )

# Built once: membership is tested for every line of every PDF.
references = set(df_ref["Reference"])

# How many lines after a checklist reference are searched for its rating.
MAX_LOOKAHEAD = 45

line = 0  # row label in df_Rate that receives the next finding
for n1 in range(y):
    rawText = parser.from_file(df_name['FileName'][n1])
    # tika may return None as content for PDFs without extractable text.
    rawList = (rawText['content'] or '').splitlines()
    long = len(rawList)
    version = ''

    for n in range(long):
        word = rawList[n].strip().upper()

        # The report header line carries the template version after
        # character 43.
        if word[:27] == 'SOLAR VICTORIA AUDIT REPORT':
            version = word[43:]

        if word[-1:] == ".":
            word = word[:-1]
        if word not in references:
            continue

        # A checklist reference was found: scan the following lines for
        # its rating, stopping at the next reference or once a rating
        # has been recorded.
        add = 1
        stop = False
        while not stop and add < MAX_LOOKAHEAD and n + add < long:
            wordcheck = rawList[n + add].strip()
            if wordcheck[-1:] == ".":
                wordcheck = wordcheck[:-1]
            if wordcheck in references:
                stop = True

            rate_text = None
            if wordcheck[:8] == "Rating -":
                # The rating value is on the line after "Rating -";
                # guard against the marker being the last line.
                nxt = n + add + 1
                rate_text = rawList[nxt].strip() if nxt < long else ''
            elif wordcheck[:7] == "Rating ":
                # The rating value follows on the same line.
                rate_text = wordcheck[7:]

            if rate_text is not None:
                df_Rate.at[line, 'SV_ID'] = df_name['SV_ID'][n1]
                df_Rate.at[line, 'Ref'] = word
                df_Rate.at[line, 'Rate'] = check_rate(rate_text)
                df_Rate.at[line, 'Version'] = version
                stop = True
                line += 1

            # Old process
            # NOTE(review): the source lost its indentation, so the
            # nesting of this extra increment is ambiguous. It is kept at
            # look-ahead-loop level, which leaves gaps in the row labels.
            # Confirm against the original notebook.
            line += 1
            add += 1

    print((n1 + 1) * 100 // y, "%")

df_Rate.to_csv('1.1 Finding Draft.csv')
df_Rate.tail()
这应该产生一个如下表
SV_ID | Ref | Rate | Version |
---|---|---|---|
INS00102811 | PCE 23 | Improvements identified | V2.4C |
我认为问题不在于 pdf 文件或脚本的主体,而主要在于软件包(package)。这段代码在我朋友的电脑上运行良好(他当着我的面实时运行过),但在我的电脑上不起作用。我们的 Python 版本可能不同,而且我们都是在 Jupyter Notebook 上运行的。
我收到以下错误:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
46 x+=1
47 df_name.to_csv('name2.csv')
---> 48 df_ref=pd.read_csv('CheckListItems.csv')
49 df_Rate=pd.read_csv('TechSafe.csv')
50
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
684 )
685
--> 686 return _read(filepath_or_buffer, kwds)
687
688
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
450
451 # Create the parser.
--> 452 parser = TextFileReader(fp_or_buf, **kwds)
453
454 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
944 self.options["has_index_names"] = kwds["has_index_names"]
945
--> 946 self._make_engine(self.engine)
947
948 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1176 def _make_engine(self, engine="c"):
1177 if engine == "c":
-> 1178 self._engine = CParserWrapper(self.f, **self.options)
1179 else:
1180 if engine == "python":
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
2006 kwds["usecols"] = self.usecols
2007
-> 2008 self._reader = parsers.TextReader(src, **kwds)
2009 self.unnamed_cols = self._reader.unnamed_cols
2010
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] No such file or directory: 'CheckListItems.csv'
我创建了一个名为“CheckListItems.csv”的空文件。
现在我显示以下错误:
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-2-8f6d401ba180> in <module>
49 df_Rate=pd.read_csv('TechSafe.csv')
50
---> 51 from tika import parser
52
53 line=0
ModuleNotFoundError: No module named 'tika'
假设是包安装问题,我尝试安装 glob
(base) C:\>pip install glob2
Requirement already satisfied: glob2 in c:\programdata\anaconda3\lib\site-packages (0.7)
我使用的是 Python 3.8.5 版。
我不确定如何运行此代码。请帮忙。谢谢你。
感谢您的友好建议。根据建议,我从 anacondas gui 安装了 tika。我发现的错误如下:
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2894 try:
-> 2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Reference'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-1-8f6d401ba180> in <module>
67 if word[-1:]==".":
68 word=word[:-1]
---> 69 if word in list(df_ref["Reference"]):
70 add=1
71 while add<45 and n+add<long:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
-> 2897 raise KeyError(key) from err
2898
2899 if tolerance is not None:
KeyError: 'Reference'
您收到此错误,99% 的可能是因为该文件不在您的代码所指定的位置——抱歉!
这一行:
df_ref=pd.read_csv('CheckListItems.csv')
是 CheckListItems.csv 文件的相对路径。所以这里是我将如何解决它:
df_ref=pd.read_csv('/home/drislam/documents/python/CheckListItems.csv')
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句