/// <summary>
/// Reads the raw content bytes of every page of a hard-coded PDF file and
/// returns them, concatenated in page order, as a single string.
/// </summary>
/// <remarks>
/// Each byte is widened to the <see cref="char"/> with the same numeric value,
/// so no text decoding is performed; bytes that belong to a multi-byte
/// encoding will not come out as readable characters.
/// NOTE(review): <c>GetPageContent</c> presumably returns the page's raw
/// content stream (drawing operators plus encoded strings) rather than plain
/// text - confirm against the library documentation. If readable text is the
/// goal, the library's <c>PdfTextExtractor.GetTextFromPage</c> is the API to
/// evaluate instead.
/// </remarks>
/// <returns>
/// The converted content of all pages, or an empty string when the document
/// has no pages.
/// </returns>
private string ReadPpf()
{
    string fn = @"E:\PDFReaderTest\article\C#从入门到精通.pdf";
    PdfReader p = new PdfReader(fn);
    try
    {
        // Total number of pages; page numbers passed to GetPageContent start at 1.
        int pg = p.NumberOfPages;
        System.Text.StringBuilder sb = new System.Text.StringBuilder();
        for (int i = 1; i <= pg; i++)
        {
            // Raw bytes of page i.
            byte[] b = p.GetPageContent(i);
            // Debug aid only: dump the current page's bytes to a text file.
            // The file is overwritten on every iteration, so after the loop
            // it holds the last page's bytes.
            System.IO.File.WriteAllBytes(@"E:\PDFReaderTest\article\xct.txt", b);
            // Append every byte as the char with the same code value.
            for (int j = 0; j < b.Length; j++)
            {
                sb.Append((char)b[j]);
            }
        }
        // Build the result once instead of re-materialising the whole buffer per page.
        return sb.ToString();
    }
    finally
    {
        // Always release the underlying file, even if reading a page throws.
        p.Close();
    }
}
试试将
System.IO.File.WriteAllBytes(@"E:\PDFReaderTest\article\xct.txt", b);
改为
System.IO.File.WriteAllText(@"E:\PDFReaderTest\article\xct.txt", System.Text.Encoding.UTF8.GetString(b));
不可以,还是乱码
@-Ada-: 试试这里的方法:抽取PDF文本
应该是 PDF 的字符编码与当前默认的编码不一致。
楼主解决了吗?我也遇到了同样的问题。