下面的coresearch 也就是no 这段代码有问题,求助大佬解决!
最下面有原题! 中文,英文版本都有
int exactSearch(int n);
int topSearch(int n);
int topKSearch(int n);
int maxnum(int * pointer);
int max(const void a, const void b);
char * s_gets(char st, int n);
char fs_gets(char st, int n, FILE fp);
int compare(char p1, char p2);
int Num0fstr(char Mstr, char substr);
char flag[30] = "<<CONTENT>>";
char input[LIM][SIZE];
int main()
{
char choice1[SIZE], choice2[SIZE], c, ch = 'y';
char * test;
test = (char )malloc(100 sizeof(char));
int n = 0;
int ct = 0, cts = 0, total;
int i = 0, querynum = 0, filect;
char filename[30];
char * ptstr[LIM];
char queryin[LIM][SIZE];
_Bool bool = 0;
char * txt;
txt = (char )malloc(102400 sizeof(char)); //每一个文档的大小
//int process=0;
FILE * fp, output, query;
while (ch == 'y')
{
printf("Do you need any special service? Such as 'exactSearch', 'topSearch', 'topKSearch'.Or input 'no'.\n");
scanf("%s", choice1);
if (strcmp(choice1, A) == 0) //这是exact
{
printf("Manual or Script?\n");
scanf("%s", choice2);
if (strcmp(choice2, X) == 0) n = 1;
exactSearch(n);
}
else if (strcmp(choice1, B) == 0) //这是top
{
printf("Manual or Script?\n");
scanf("%s", choice2);
if (strcmp(choice2, X) == 0) n = 1;
printf("%d", n);
topSearch(n);
}
else if (strcmp(choice1, C) == 0) //这是topK
{
printf("Manual or Script?\n");
scanf("%s", choice2);
if (strcmp(choice2, X) == 0) n = 1;
topKSearch(n);
}
else if (strcmp(choice1, D) == 0) //这是no
{
printf("Manual or Script?\n");
scanf("%s", choice2);
getchar();
output = fopen("output.txt", "w");
if (strcmp(choice2, X) == 0) n = 1;
//这是手动输入关键词
if (n == 1)
{
printf("Now please input your query term,separated by space and confirm by enter key.NO more than %d\n", LIM);
while (ct < LIM&&fgets(input[ct], SIZE, stdin) != NULL && input[ct][0] != '\n') //确定用户输入正确
{
input[ct][strlen(input[ct]) - 1] = '\0';
ct++;
}
for (i = 0; i < ct; i++)//ct即count用户输入关键词的数量
{
char t[2] = " ";
strcat(input[i], t);
strcat(t, input[i]);
strcpy(input[i], t);
}
for (i = 0, filect = 1; i < total; i++, filect++)//搜索文件
{
cts = 0;
if (i < 9) sprintf(filename, "00%d.txt", filect);
if (i < 99 && i>10) sprintf(filename, "0%d.txt", filect);
if (i > 100 && i < total) sprintf(filename, "%d.txt", filect);//有什么好的优化方法?
if ((fp = fopen(filename, "r")) == NULL)
printf("ERROR filename...\n");
system("PAUSE");
while ((fscanf(fp, "%s", test) != EOF) && (compare(test, flag) != 0));
while (1)
{
c = fgetc(fp);
if (c == EOF)
break;
txt[cts] = c;
cts++;
}//把文档里面的字符读到数组
for (querynum = 0; querynum < ct; querynum++)//????
{
if (Num0fstr(txt, input[querynum]) != 0)//计算关键词的个数
fprintf(output, "%d %d\n", querynum, filect);
//printf("%d %d\n",querynum,filect);
}
//system("PAUSE");
while (txt[cts] != '\0')
{
txt[cts] = '\0'; cts++;
}//清空字符串
fclose(fp);
printf("%d\n", filect);
}
fclose(output);
}
else //这是文件输入关键词
{
if ((query = fopen("query.txt", "r")) == NULL)
printf("ERROR query...\n");
int ct = 0;
int i = 0, querynum = 0;
char input[LIM][SIZE];
while (ct < LIM&&fs_gets(input[ct], SIZE, query) != NULL && input[ct][0] != '\0') ct++;
//for(i=0;i<ct;i++) printf("%s",input[i]);
for (i = 0, filect = 1; i < total; i++, filect++)
{
sprintf(filename, "00%d.txt", filect);//文件名的递增
cts = 0;
if ((fp = fopen(filename, "r")) == NULL) printf("ERROR filename...\n");
while ((fscanf(fp, "%s", test) != EOF) && (compare(test, flag) != 0));
while (1)
{
c = fgetc(fp);
if (c == EOF) break;
txt[cts] = c;
cts++;
}
for (querynum = 0; querynum < ct; querynum++)
{
if (Num0fstr(txt, ptstr[querynum]) != 0)
fprintf(output, "%d %d\n", querynum, i);
}
txt[0] = '\0';
fclose(fp);
fclose(output);
printf("%d\n", filect);
}
fclose(query);
}
fclose(output);
}
free(txt);
printf("Wanna again? Input 'y' to continue or 'n' to quit\n");
//while(getchar()!='\n') continue;
ch = getchar();
}
return 0;
}
int exactSearch(int n)
{
char input[LIM][SIZE];
char filename[SIZE];
char * ptstr[LIM];
FILE * fp;
char test[20];
int ct = 0, i = 0, process = 0, filect, total;
_Bool bool = 0;
int cts, count, fbool;
char txt[1024 * SIZE], c;
FILE * query, *exactSearchout;
exactSearchout = fopen("exactSearchout.txt", "w");
printf("Enter the number of files you want to test\n");
scanf("%d", &total);
if (n == 1)
{
printf("Now please input your query term,separated by space and confirm by enter key.NO more than %d\n", LIM);
getchar();
while (ct < LIM&&fgets(input[ct], SIZE, stdin) != NULL && input[ct][0] != '\n')
{
input[ct][strlen(input[ct]) - 1] = '\0';
ct++;
//ptstr[ct]=input[ct];
}
for (i = 0, filect = 1; i < total; i++, filect++)
{
bool = 0;
if (i < 9) sprintf(filename, "00%d.txt", filect);
if (i < 99 && i>10) sprintf(filename, "0%d.txt", filect);
if (i > 100 && i < total) sprintf(filename, "%d.txt", filect);
//printf("%s\n",filename);
if ((fp = fopen(filename, "r")) == NULL)
printf("ERROR filename...\n");
cts = 0;
while ((fscanf(fp, "%s", test) != 0) && compare(test, flag) != 0);
while (1)
{
c = fgetc(fp);
if (c == EOF)
break;
txt[cts] = c;
cts++;
}
//system("PAUSE");
for (count = 0, fbool = 0; count < ct; count++)
{
if (Num0fstr(txt, input[count]) != 0) fbool++;
}
printf("%d%d\n", fbool, count);
if (fbool == count) bool = 1;
if (bool)
fprintf(exactSearchout, "%s\n", filename);
while (txt[cts] != '\0')
{
txt[cts] = '\0'; cts++;
}
//printf("%d\n",filect);
fclose(fp);
}
fclose(exactSearchout);
}
else
{
query = fopen("query.txt", "r");
while (ct < LIM&&fscanf(query, "%10s", input[ct]) == 1 && input[ct][0] != '\0')
{
ptstr[ct] = input[ct];
ct++;
}
for (i = 0, ct = 0; i < 510; i++, ct++)
{
sprintf(filename, "%d.txt", i);;
fp = fopen(filename, "r");
exactSearchout = fopen("exactSearchout.txt", "w");
while ((fscanf(fp, "%s", test) != EOF) && compare(test, flag) != 0);
for (int count = 0, fbool = 0; count < ct; count++)
{
while (fscanf(fp, "%s", test) != 0)
{
if (strpbrk(ptstr[count], test) != NULL) { fbool++; break; }
}
if ((fbool - 1) == count) bool = 1;
}
if (bool) fprintf(exactSearchout, "%d\n", filename);
fclose(fp);
process++;
printf("%d succeed...%d remain...Please wait.\n", process, 510 - process);
}
fclose(exactSearchout);
fclose(query);
}
return 0;
}
int topSearch(int n)
{
char filename[LIM];
char input[LIM][SIZE];
char * ptstr[LIM], test[20];
int ct = 0, c = 0, total = 0, process = 0;
int i = 0, ret_val, sep[510], len;
FILE * fp, query, topSearchout;
_Bool bool = 0;
int count = 0, fbool = 0;
if (n == 1)
{
printf("Now please input your query one by one...NO more than %d\n", LIM);
while (ct < LIM&&fgets(input[ct], SIZE, stdin) != NULL && input[ct][0] != '\n')
{
input[ct][strlen(input[ct]) - 1] = '\0';
ct++;
}
int sep[510] = { 0 };
for (i = 0; i < 510; i++, c++)
{
sprintf(filename, "%d.txt", i);
topSearchout = fopen("topSearchout.txt", "w");
while (fscanf(fp, "%s", test) != EOF)
{
for (count = 0, fbool = 0; count < ct; count++)
if (compare(ptstr[count], test) == 0) sep[i]++;
}
fclose(fp);
fclose(topSearchout);
process++;
}
ret_val = maxnum(sep);
fprintf(topSearchout, "%d.txt", ret_val + 1);
}
else
{
query = fopen("query.txt", "r");
while (ct < LIM&&fscanf(query, "%10s", input[ct]) == 1 && input[ct][0] != '\0')
{
ptstr[ct] = input[ct];
ct++;
}
for (i = 0, ct = 0; i < 510; i++, ct++)
{
sprintf(filename, "%d.txt", i);
fp = fopen(filename, "r");
topSearchout = fopen("topSearchout.txt", "w");
while ((fscanf(fp, "%s", test) != EOF) && compare(test, flag) != 0);
for (count = 0, fbool = 0; count < ct; count++)
{
while (fscanf(fp, "%s", test) != EOF)
{
if (compare(ptstr[count], test) == 0) fbool++;
if ((fbool - 1) == count) bool = 1;
}
}
if (bool) fprintf(topSearchout, "%s", filename);
fclose(fp);
process++;
printf("%d succeed...%d remain...Please wait.\n", process, 510 - process);
}
fclose(topSearchout);
fclose(query);
ret_val = maxnum(sep);
printf("%d", ret_val + 1);
}
return 0;
}
int maxnum(int ar[])
{
int j = 1, maxnum1 = 0;
int max;
for (; j < 510; j++)
if (max < ar[j]) { max = ar[j]; maxnum1 = j; }
return maxnum1;
}
int topKSearch(int n)
{
char filename[LIM];
char input[LIM][SIZE];
char * ptstr[LIM], test[20];
int ct = 0, c = 0, total = 0;
int i = 0, ret_val, process = 0;
FILE * query, fp, topKSearchout;
_Bool bool = 0;
int sep[510];
int count = 0, fbool = 0;
if (n == 1)
{
printf("Now please input your query one by one...NO more than %d\n", LIM);
while (ct < LIM&&fgets(input[ct], SIZE, stdin) != NULL && input[ct][0] != '\n')
{
input[ct][strlen(input[ct]) - 1] = '\0';
ct++;
}
int sep[510] = { 0 };
for (i = 0; i < 510; i++, c++)
{
sprintf(filename, "%d.txt", i);
fp = fopen(filename, "r");
topKSearchout = fopen("topKSearchout.txt", "w");
while ((fscanf(fp, "%s", test) != EOF) && compare(test, flag) != 0);
while (fscanf(fp, "%s", test) != EOF)
{
for (int count = 0, fbool = 0; count < ct; count++)
if (compare(ptstr[count], test) == 0) sep[i]++;
}
fclose(fp);
process++;
printf("%d succeed...%d remain...Please wait.\n", process, 510 - process);
}
qsort(sep, 510, sizeof(int), max);
fprintf(topKSearchout, "%d%d%d", sep[0], sep[1], sep[2]);
fclose(topKSearchout);
}
else
{
query = fopen("query.txt", "r");
while (ct < LIM&&fscanf(query, "%10s", input[ct]) == 1 && input[ct][0] != '\0')
{
ptstr[ct] = input[ct];
ct++;
}
for (i = 0, ct = 0; i < 510; i++, ct++)
{
sprintf(filename, "%d.txt", i);
fp = fopen(filename, "r");
topKSearchout = fopen("topKSearchout.txt", "w");
while ((fscanf(fp, "%s", test) != EOF) && compare(test, flag) != 0);
for (int count = 0, fbool = 0; count < ct; count++)
{
while (fscanf(fp, "%s", test) != EOF)
{
if (compare(ptstr[count], test) == 0) fbool++;
if ((fbool - 1) == count) bool = 1;
}
}
if (bool) fprintf(topKSearchout, "%s", filename);
fclose(fp);
process++;
printf("%d succeed...%d remain...Please wait.\n", process, 510 - process);
}
fclose(query);
qsort(sep, 510, sizeof(int), max);
fprintf(topKSearchout, "%d%d%d", sep[0], sep[1], sep[2]);
fclose(topKSearchout);
}
return 0;
}
int max(const void a, const void b)
{
const int p1 = (const int )a;
const int p2 = (const int )b;
if (p1 > p2)
return -1;
else if (p1 == p2)
return 0;
else return 1;
}
char * s_gets(char st, int n)
{
char ret_val;
int i = 0;
ret_val = fgets(st, n, stdin);
if (ret_val)
{
while (st[i] != '\n'&&st[i] != '\0') i++;
if (st[i] == '\n') st[i] = '\0';
else while (getchar() != '\n') continue;
}
return ret_val;
}
char * fs_gets(char st, int n, FILE fp)
{
char *ret_val;
int i = 0;
ret_val = fgets(st, n, fp);
if (ret_val)
{
while (st[i] != '\n'&&st[i] != '\0') i++;
if (st[i] == '\n') st[i] = '\0';
else while (getchar() != '\n') continue;
}
return ret_val;
}
int compare(char p1, char p2)
{
int r, t;
for (; p1 == p2 && (p1 != '\0' || p2 != '\0'); p1++, p2++);
r = p1 - p2;
return (r == 0) ? (t = 0) : (t = 1);
}
int Num0fstr(char Mstr, char substr)
{
int number = 0;
char p;
char q;
while (Mstr != '\0')
{
p = Mstr;
q = substr;
while ((p == q) && (p != '\0') && (q != '\0'))
{
p++;
q++;
}
if (q == '\0') number++;
Mstr++;
}
return number;
}
你被著名的武汉IT初创公司HackIT聘为工程师,这家公司最近获得了B轮融资。HackIT正在开发一种新的搜索引擎,他们希望将其集成到他们的信息管理和数据挖掘产品中。特别是HackIT的项目集中在financial新闻文章寻找相关信息。该产品将引起金融分析师、投资银行家和记者的兴趣。分配给你的工程团队应该构建first原型的核心搜索技术。搜索引擎开发的设计还处于早期阶段,但是HackIT的技术部门已经概述了它的基本功能(参见算法1中的伪代码)。
您的任务是实现基本的核心算法,并使用高级特性对其进行扩展。
你的任务包括:
核心搜索引擎的实现
搜索引擎是HackIT的一项关键任务核心技术。服务于数以百万计的并发用户搜索需要高性能-快速的程序执行。HackIT的CTO Dr. Dom要求您在不依赖第三方库的情况下使用纯C实现整个项目。
增强核心搜索引擎使用高级搜索功能
以市场HackIT高科技的搜索技术,您需要扩展核心算法与高级功能。
测试
为了确保高质量的服务,我们为您提供了一个基准集,该基准集允许您测试算法是否按预期工作。
HackIT数据团队为您的搜索引擎准备了一个测试数据集,您可以从他们的开发服务器http://chinabigdatatraining.com/dataset/dataset.zip的归档文件中下载该测试数据集。数据是HackIT属性,该属性拥有该数据的所有权利,但授予您在实现其搜索引擎时使用该数据的特权。HackIT不允许您误用或分发它们的数据。
一个数据采集团队成员指出,文档id是由文档名称(excluding denedle extension).
例如:
document. txt→文档id: documentX
此外,他还指出,每个文档的开头都包含与您的任务无关的edna代码。每个文档以CONTENT关键词为开始标志。
Term definition
手头的任务要求您实现一个信息检索系统。因此,您将接触到信息检索、数据挖掘和投标数据应用领域的研究术语。以下部分将介绍理解作业所需的常用术语。
搜索引擎
在数据集中搜索和标识与用户指定的关键字或字符相对应的信息的一种程序。商业网站的一个例子是谷歌或百度,它专注于在万维网上搜索特定的网站。
查询
由搜索引擎用户输入的一组搜索关键词。该查询表达了搜索引擎用户的信息需求。例如,用户可能对特定手机的产品发布信息感兴趣。他或她的搜索查询可能是这样的:“iphone 2019年发布日期”
查询项
搜索组成查询的术语。示例查询“iphone 2019发布日期”由5个查询词组成,它们共同构成了搜索查询,表达了搜索引擎用户的信息需求。
脚本
计算机脚本是由某个程序或脚本引擎执行的命令的列表。
文档ID
文件的唯一标识符,即:准确描述一个文档的数字或术语。
查询ID
查询的唯一标识符,即:准确描述一个查询的数字或术语。
错误
软件缺陷是一个错误,瑕疵,失败或故障的计算机程序或系统,导致其身体产生不正确的或意想不到的结果,或意想不到的运行。
任务
你是HackIT的软件工程师,负责实现他们的搜索引擎。在执行搜索引擎时,应该要求用户使用命令行参数指定“搜索模式”。用户可以选择两种模式:
手册指导搜索
手动引导搜索(命令行关键字“Manual”)允许用户手动(手动)在搜索引擎中键入查询项。提交搜索词(按下enter键)后,搜索引擎应该将所有文档id打印到与至少一个查询词匹配的屏幕上。//读取回车键
脚本引导搜索
脚本引导搜索(命令行关键字“脚本”)允许用户创建一个名为“查询”的查询文件。txt”在与搜索引擎相同的文件位置。查询文件包含由查询ID标识的搜索查询(每行1个)。
示例查询。txt: q1 house mouse
q2 star car bar
在本例中,q1和q2是查询1和查询2的查询id。查询1 (q1)包含2个查询词(house and mouse),查询2包含3个查询词(star, car, bar)
执行以下步骤:
创建源文件(1点)
创建一个名为searchEngineXXX.c的源代码文件(XXX应该替换为您的学生证)。源代码文件保存了搜索引擎的全部代码。
3.实现核心搜索引擎算法(40分)
实现核心搜索引擎,提供两种搜索模式-手动和脚本。搜索模式应使用“manual”或“script”选项指定为命令行参数。算法1说明了核心搜索引擎的伪代码。
高级搜索引擎I -精确搜索(15分)
通过实现一个精确搜索特性作为命令行选项“exactSearch”来扩展核心搜索引擎。此功能应适用于两种搜索模式(手动和脚本),并与所有其他高级搜索功能兼容。如果搜索引擎使用“exactSearch”命令行参数执行,则只打印那些与所有查询项匹配的文档。
例如:
d1 {a, b, c}
d2 {a, g, d}
d3 {a, b}
d4 {a, b, r, f}
q1 {a, b, f}
当“exactSerach”处于活动状态时,应该只打印文档d4,因为它是唯一匹配查询q1的所有查询项的文档。例如: q1的所有查询项(a, b, f)也出现在d4中。
高级搜寻引擎II -顶端搜寻(15分)
通过实现top搜索特性作为命令行选项“topSearch”来扩展搜索引擎。此功能应适用于两种搜索模式(手动和脚本),并与所有其他高级搜索功能兼容。如果搜索引擎使用“topSearch”命令行参数执行,则只将包含最多查询项(查询与文档之间重叠最大)的文档打印到屏幕上。
例如:
d1 {a,a,a,a,c } d2 { a,g,d,d } d3 {a,b,x} 4 { a,b,r,f }
q1 {a, b, x}
当“topSearch”处于活动状态时,搜索查询q1时,只需要将文档d1打印到屏幕上。注意,文档d3匹配q1中的所有查询项,但是d3和q1的重叠部分为3,而d1和q1 = 4的重叠部分为重复查询项。
7.高级搜索引擎III - Top KSearch(15分)
扩展您的搜索引擎,实现排名搜索功能作为命令行选项“topKSearch”。此功能应适用于两种搜索模式(手动和脚本),并与所有其他高级搜索功能兼容。如果搜索引擎使用“topKSearch”命令行参数执行,则打印与搜索查询重叠最大的3个文档。
例如:
d1 { a,b,c } d2 { a、g、d } d3 { a,b,f,d } d4 { a,b,b r f } d5 { g r d }
q1 {a, b, f}
使用查询q1搜索文档d1 - d5时,应打印以下3个结果:
q1 d4
q1 d3
q1 d1
注意,这里的文档顺序并不重要。前3个文档是由查询和文档之间的重叠决定的:
d4与q1 = 4的重叠;d3&q1 = 3;d1&q1 = 2;d2&q1 = 1;d5&q1 = 0;
6输出格式
本节定义脚本搜索模式所需的输出格式。重要的是,搜索引擎的脚本模式必须正确地遵循输出格式,以确保可以评估算法。当您的核心算法以脚本模式处理某个查询并找到包含一个或多个查询项的文档时,您需要将以下行打印到屏幕上:
query-ID document-ID
每个查询/文档ID对应该在它自己的行中,并由一个空格(空白)分隔。手动模式的输出格式由您决定,不再由HackIT进一步指定。
7日提交
数字提交:
为了评估您的性能,您需要提交源代码文件(不是可执行文件)。评估结果是基于您的算法的正确实现和结果。分的总数是100,分布在各个搜索引擎的功能(如在第三节中指定)。HackIT的技术部门将评估你的搜索引擎通过自动化测试,你需要密切关注正确的源代码文件名命名以及所有的命令行参数和输出格式。按照本文档所述准确命名所有函数。
注意:如果您不能满足准确的命名约定,自动评估脚本将失败,您的提交结果将是负面的。
HackIT将于下周通过微信群发布提交系统。您将获得一个在线提交系统的链接,包括您的用户名和密码。您必须在课程结束前,使用所提供的提交链接连同您的用户名和密码提交您的源代码的数字版本。
物理提交:
除了以电子方式提交你的源代码外,你还必须在课程结束前缴交一份实验报告的实体(印刷)版本。
School of Information Management C Laboratory Winter 2018
Final Project
created by Dominik Wurzer November 22, 2018
1 Introduction
You are hired as an Engineer by HackIT - the infamous Wuhan IT start-up which recently secured their series B funding. HackIT is developing a new search engine which they want to incorporate into their information management and data mining products. In particular, HackIT's project focuses on searching nancial news articles for relevant information. The product will be of interest to nancial analysts, investment bankers and journalists. You are assigned to the engineering team that is supposed to build the rst prototype of the core search technology. The design of the search engine development is in its early stage but HackIT's technology division already outlined its basic functionality (see pseudo code in Algorithm 1). Your task is to implement the basic core algorithm and extend it with advanced features.
Your tasks includes:
Implementation of the core search engine The search Engine is a mission critical core technology of HackIT. Serving millions of concurrent user searches requires high performance fast program execution. HackIT's CTO Dr. Dom requires you to implement the entire project in plain C without relying on 3rd party libraries.
Enhancing the core search engine with advanced search features In order to market HackIT's search technology as high-tech, you are required to extend the core algorithm with advanced features.
Testing To ensure a high quality of service, you are provided with a benchmark set that allows you to test whether your algorithm in working as expected.
1
C Laboratory School of Information Management
2 Data
The HackIT data team prepared a test data set for your search engine, which you can download in an archive form their development server: http://chinabigdatatraining.com/dataset/dataset.zip. The data is HackIT property, which owns all rights to it but grants you the privilege to use it for the implementation of their search engine. HackIT does not allow you to miss-use or distribute their data.
A data acquisition team member notes that the document-ID is dened by the document name (excluding the le extension).
For example:
documentX.txt → document-ID: documentX
Further he states that the beginning of each document contains EDNA-codes that are NOT relevant for your task. The content of each document starts AFTER the CONTENT key word.
For example:
Figure 1: Example document. The search engine should only consider the terms after the CONTENT tag (framed by the green line) and ignore anything before and including the CONTENT tag (framed by the red line)
2
C Laboratory School of Information Management
3 Term Denition
The task at hand requires you to implement an information retrieval system. Consequently, you are exposed to terminology of research elds covering Information Retrieval, Data Mining and Bid Data applications in general. The following section introduces common terms needed to understand your assignment.
Search Engine A program that searches for and identies information in a data set that correspond to keywords or characters specied by the user. An example for a commercial website is Google or Baidu which focus on searching particular web sites on the World Wide Web.
Query A set of search key words entered by the user of a search engine. The queury expresses the information need of the search engine user. For example, a user might be interested in information about a product release of a particular phone. His or her search query might look like this: "release date of iphone 2019"
Query Terms Search terms that make up the query. The example query "release date of iphone 2019" consists of 5 query terms that together make up the search query and express the information need of the search engine user.
Script A computer script is a list of commands that are executed by a certain program or scripting engine.
Document ID A unique identier for documents, i.e.: a number or term that describes exactly 1 document.
Query ID A unique identier for queries, i.e.: a number or term that describes exactly 1 query.
Bug A software bug is an error, aw, failure or fault in a computer program or system that causes it to produce an incorrect or unexpected result, or to behave in unintended ways.
3
C Laboratory School of Information Management
4 Tasks
You are a software engineer at HackIT and tasked to implement their search engine. When executing the search engine the user should be required to specify the "search mode" using a command line parameter. The user should be able to choose from 2 modes:
Manual Guided Search Manual guided search (command line key-word "manual") allows the user to type query terms into the search engine by hand (manually). After submitting the search terms (enter key pressed) the search engine should print all document IDs to the screen that match at least one of the query terms.
Script Guided Search Script guided search (command line key-word "script") allows the user to create a query le called "query.txt" in the same le location as the search engine. The query le contains search queries (1 per line) identied by the query ID.
Example query.txt: q1 house mouse q2 star car bar
In this example q1 and q2 are the query-IDs of query 1 and query 2. Query 1 (q1) consists of 2 query terms (house and mouse), whereas query 2 consists of 3 query terms (star, car, bar)
Carry out the following steps:
.1. Acquire Data Set Create a new working directory called "SearchEngine" and download the data set. Unpack the archive in your working directory and inspect the les manually. Pay attention to which part of the documents are relevant to your search engine, as stated in section Data.
.2. Creating Source File (1 Point) Create a source code le named searchEngineXXX.c (XXX should be replaced with your student-ID). The source code le holds the entire code for your search engine.
.3. Implementing the Core Search Engine Algorithm (40 Points) Implement the core search engine that oers two searching modes manual and script. The search mode should be specied as a command line parameter using the option "manual" or "script". Algorithm 1 illustrates the pseudo code of the core search engine.
4
C Laboratory School of Information Management
Algorithm 1 : Pseudo Code Search Engine HackIT 1: check command line parameter for search mode 2: if search mode = "manual" then 3: ask user to enter query term (qt), separated by space and conrm by enter key 4: query array QA = {qt} /∗→ each term is one element in QA∗/ 5: for all document d ∈ training data set do 6: for all terms t ∈ d do 7: if t ∈ QA then 8: print id of document d to screen (one per line) 9: end-if 10: end-for 11: end-for 12: else 13: if search mode = "script" then 14: read query le 15: for all query q ∈ query le do 16: query array QA = {query term qt ∈ q} /∗ → each term is one element in QA∗/17: for all document d ∈ training data set do18: for all terms t ∈ d do19: if t ∈ QA then20: print id of q and d to screen (one per line) 21: end-if 22: end-for 23: end-for 24: end-for 25: end-if 26: end-if
5
C Laboratory School of Information Management
.4. Correcting the Core Search Engine Algorithm (14 Points) When creating the pseudo code for the search engine, the engineers of HackIT made a mistake as their version of the algorithm prints the document ID for each query term that matches a document term. This is of course a bug and not acceptable. You are required to ensure that each document ID is only printed a single time to screen, independently of the number of query terms that match it.
.5. Advanced Search Engine I - Exact Search (15 Points) Extend the core search engine by implementing an exact search feature as a command line option "exactSearch". This feature should be applicable for both search modes (manual & script) and compatible with all other advanced search features. If the search engine is executed with "exactSearch" command line parameter, print only those documents to screen that match ALL query terms.
For example: d1{a,b,c} d2{a,g,d} d3{a,b} d4{a,b,r,f} q1{a,b,f} When "exactSerach" is active only document d4 should be printed as it is the only document that matches all query terms of query q1, i.e: all query terms (a,b,f) of q1 also occur in d4.
.6. Advanced Search Engine II - Top Search (15 Points) Extend your search engine by implementing a top search feature as a command line option "topSearch". This feature should be applicable for both search modes (manual & script) and compatible with all other advanced search features. If the search engine is executed with "topSearch" command line parameter, print only the document containing the most amount of query terms (biggest overlap between query and document) to the screen.
For example: d1{a,a,a,a,c} d2{a,g,d,d} d3{a,b,x} d4{a,b,r,f} q1{a,b,x} When "topSearch" is active only document d1 should be printed to the screen when searching for query q1. Note that document d3 matches all query terms in q1 but the overlap of d3 and q1 = 3, whereas the overlap of d1 and q1 = 4 as repeated query terms are taken into account.
6
C Laboratory School of Information Management
.7. Advanced Search Engine III - Top K Search (15 Points) Extend your search engine by implementing a ranked search feature as a command line option "topKSearch". This feature should be applicable for both search modes (manual & script) and compatible with all other advanced search features. If the search engine is executed with "topKSearch" command line parameter, print the 3 documents with the biggest overlap with the search query.
For example: d1{a,b,c} d2{a,g,d} d3{a,b,f,d} d4{a,b,b,r,f} d5{g,r,d} q1{a,b,f} When searching documents d1 - d5 with query q1, you should print the following 3 results: q1 d4 q1 d3 q1 d1
Note that the document order is not important here. The top 3 documents are determined by the overlap between the query and documents: overlap of d4&q1 = 4; d3&q1 = 3; d1&q1 = 2; d2&q1 = 1; d5&q1 = 0;
5 Benchmark Testing
HackIT provides its engineers with a benchmark set of queries with their corresponding correct search results. You can download this benchmark set from their development server: http://chinabigdatatraining.com/dataset/testset.zip. When implementing your search engine, you can run it on the data set and with the test queries provided by the benchmark set and compare your results.
7
C Laboratory School of Information Management
6 Output Format
This section denes the required output format for the script search mode. It is important that the script mode of your search engine follows the output format correctly to ensure that your algorithm can be assessed. Whenever your core algorithm processes a certain query in script mode and nds a document that contains one or more of the query terms, you are required to print the following line to the screen:
query-ID document-ID
Each query/document ID pair should be in its own line and separated by a single space (blank). The output format of the manual mode is up to you and not further specied by HackIT.
7 Submission
Digital Submission: In order to evaluate your performance you are required to submit your the source code le (NOT executable). The evaluation result is based on the correct implementation and result of your algorithm. The total number of possible points is 100, which are distributed across the individual functions of your search engine (as specied in Section 3). The technology division of HackIT will evaluate your search engine by automated testing, which requires you to pay close attention to the correct naming of the source code lename as well as all command line parameters and output format. Name all function exactly as stated in this document.
NOTE: If you fail to meet the exact naming convention the automated evaluation script will fail and your submission result will be negative.
HackIT will release the submission system by next week via WeChat group. You will be provided with a link to an online submission system including your username and password. You are required to submit a digital version of your source code using the provided submission link in combination with your username and password before the end of the course.
Physical Submission: In addition to the digital submission of your source code, you are required to hand in a physical (printed) version of your lab report before the end of the course.
HackIT's head of technology division Dr. Dom wishes you good luck, fun and
如果有大佬能重新写这题,并且在28号之前完成! 有现金报酬!
如果有大佬能重新写这题,并且在28号之前完成! 有现金报酬!
– Sincerity8099 5年前