Revision 52054
Added by Miriam Baglioni over 6 years ago
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorPlugin.java | ||
---|---|---|
11 | 11 |
|
12 | 12 |
@Override |
13 | 13 |
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException { |
14 |
return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl()); |
|
14 |
return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl(), interfaceDescriptor.getParams().get("filter"));
|
|
15 | 15 |
} |
16 | 16 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
24 | 24 |
|
25 | 25 |
private final ArrayList<String> urls = new ArrayList<>(); |
26 | 26 |
private final ArrayList<String> metas = new ArrayList<String>(); |
27 |
private String filter; |
|
27 | 28 |
|
29 |
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){ |
|
28 | 30 |
|
29 |
public HTTPWithFileNameCollectorIterable(String startUrl){ |
|
30 |
|
|
31 | 31 |
urls.add(startUrl); |
32 |
this.filter = filter; |
|
32 | 33 |
} |
33 | 34 |
|
35 |
private boolean containsFilter(String meta){ |
|
36 |
if (filter == null || filter.isEmpty()) |
|
37 |
return false; |
|
38 |
String[] filter = this.filter.split(";"); |
|
39 |
for(String item:filter){ |
|
40 |
if (meta.contains(item)) |
|
41 |
return true; |
|
42 |
} |
|
43 |
return false; |
|
44 |
} |
|
45 |
|
|
34 | 46 |
private String addFilePath(String meta,String url, boolean isJson){ |
35 | 47 |
String path = url.replace("metadata", "pdf"); |
48 |
|
|
36 | 49 |
try { |
37 | 50 |
if(isJson) |
38 | 51 |
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
39 | 52 |
else{ |
40 |
if (meta.startsWith("<!DOCTYPE")) |
|
41 |
meta = meta.substring(meta.indexOf(">")+1); |
|
42 |
int index = meta.lastIndexOf("</"); |
|
43 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
53 |
|
|
54 |
if (meta.startsWith("<!DOCTYPE")) |
|
55 |
meta = meta.substring(meta.indexOf(">")+1); |
|
56 |
int index = meta.lastIndexOf("</"); |
|
57 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
58 |
|
|
59 |
|
|
44 | 60 |
} |
45 | 61 |
|
46 | 62 |
}catch(Exception ex){ |
... | ... | |
91 | 107 |
if(c.isStatusOk()){ |
92 | 108 |
try { |
93 | 109 |
String ret = c.getResponse(); |
94 |
if (ret != null && ret.length()>0) |
|
110 |
if (ret != null && ret.length()>0 && !containsFilter(ret))
|
|
95 | 111 |
queue.put(addFilePath(ret,url,url.endsWith(".json"))); |
96 | 112 |
} catch (InterruptedException e) { |
97 | 113 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
modules/dnet-collector-plugins/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml | ||
---|---|---|
36 | 36 |
<property name="protocolDescriptor"> |
37 | 37 |
|
38 | 38 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolDescriptor" p:name="HTTPWithFileName"> |
39 |
<!--<property name="params">-->
|
|
40 |
<!--<list>-->
|
|
41 |
<!--<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter"-->
|
|
42 |
<!--p:name="baseURL" />-->
|
|
43 |
<!--</list>-->
|
|
44 |
<!--</property>-->
|
|
39 |
<property name="params">
|
|
40 |
<list>
|
|
41 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter"
|
|
42 |
p:name="filter" />
|
|
43 |
</list>
|
|
44 |
</property>
|
|
45 | 45 |
</bean> |
46 | 46 |
</property> |
47 | 47 |
</bean> |
Also available in: Unified diff
Filter the collected metadata: added a "filter" param to the protocol template to specify (as a ';'-separated list of strings) which metadata records to filter out