# This is the configuration file for the NewsLookout web scraping application.
#
# It is organised into 4 groups of settings: installation, operation, logging and plugins.
# No [table] headers are used: every key below is defined in the root table.
#
# Notice:
# This software is intended for demonstration and educational purposes only. This software is
# experimental and a work in progress. Under no circumstances should these files be used in
# relation to any critical system(s). Use of these files is at your own risk.
#
# Before using it for web scraping any website, always consult that website's terms of use.
# Do not use this software to fetch any data from any website that has forbidden use of web
# scraping or similar mechanisms, or violates its terms of use in any other way. The author is
# not liable for such kind of inappropriate use of this software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
# FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# This section lists the file and directory locations.

# The configuration directory is specified here:
conf_dir = "conf"

# The data_dir specifies the name of the directory where the data files will be saved:
data_dir = "data/files"

# The master data files will be stored in this folder
# (e.g. those datasets that are appended daily):
master_data_dir = "data/master_data"

models_dir = "models"

# Location of the log file, here it is specified relative to the data directory:
log_file = "logs/newslookout.log"

# If a PID (process identifier) file exists, the application will not launch.
# This is to prevent multiple instances being launched on the same machine.
# As part of the shutdown sequence, the PID file will be automatically deleted.
pid_file = "logs/newslookout.pid"

# The sqlite data file that stores the history of previously retrieved URLs:
# completed_urls_datafile = "/media/shared/newslookout_urls.db"
completed_urls_datafile = "data/newslookout_urls.db"

cookie_file = "data/cookies.txt"

# Levels of recursion to follow links for identifying news articles within websites.
# Min value is 1, max is 4, any other values do not have any effect.
recursion_level = 1

# Proxy server url (protocol:host:port).
# Will attempt to use proxy if this is configured.
# proxy_server_url = ""

# Time in seconds to wait for when retrieving a page:
fetch_timeout = 60

# Time to wait to establish TCP connection:
connect_timeout = 10

# Number of times to retry connecting if failed:
retry_count = 3

# The fixed number of seconds to wait between web fetches, this
# fixed time is added to the random time to determine the total wait time
# between two web fetches to the same URL.
retry_wait_fixed_sec = 3

# Should raw html be saved as compressed bzipped files?
save_html = true
# save_html = false

# The user agent to use for the web scraper's HTTP(S) requests:
user_agent = "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"

# Log levels can be one of the following
# (starting from less verbosity to full verbosity): ERROR, WARN, INFO, or DEBUG
# log_level = "ERROR"
# log_level = "WARN"
log_level = "INFO"
# log_level = "DEBUG"

# Max file size of the log file, when the log file grows
# beyond this size limit, it will be rotated:
max_logfile_size = 2_048_000

# This many backups of the log file will be retained:
logfile_backup_count = 30

# In this section, list the names of all modules to be enabled.
# The number assigned to the priority attribute indicates execution priority.
# A lower number indicates higher priority, so it will be run before others are run.
# Each entry is an inline table and must stay on a single line (TOML 1.0).
plugins = [
    { enabled = true, name = "mod_in_gdelt", type = "retriever", priority = 1 },
    { enabled = true, name = "mod_in_nse", type = "retriever", priority = 2 },
    { enabled = true, name = "mod_in_bse", type = "retriever", priority = 2 },
    { enabled = true, name = "mod_en_in_inexp_business", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_ndtv", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_business_std", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_livemint", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_timesofindia", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_moneycontrol", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_hindu", type = "retriever", priority = 3 },
    { enabled = true, name = "mod_en_in_indiankanoon", type = "retriever", priority = 5 },
    { enabled = true, name = "mod_en_in_trak", type = "retriever", priority = 4 },
    { enabled = true, name = "mod_en_in_forbes", type = "retriever", priority = 4 },
    { enabled = true, name = "mod_en_in_ecotimes", type = "retriever", priority = 3 },
    { enabled = false, name = "mod_en_in_rbi", type = "retriever", priority = 1, max_pages = 10, items_per_page = 10 },
    { enabled = true, name = "mod_offline_docs", type = "retriever", priority = 3, file_extension = "json", folder_name = "data/files", published_in_past_days = 999999 },
    # { enabled = true, name = "mod_offline_docs", type = "retriever", priority = 3, file_extension = "pdf", folder_name = "data/pdf_files", published_in_past_days = 999999 },

    # The following are data processing plugins that will be run serially, in order of priority.
    # A lower number indicates higher priority, so it will be run before others are run.
    { enabled = true, name = "split_text", type = "data_processor", priority = 1, overwrite = false, min_word_limit_to_split = 700, previous_part_overlap = 70 },

    # It is recommended to keep these plugins disabled, they will slow down the entire application
    # since the models are very large and computationally intensive.
    # Enable them if you are running the application on very capable hardware > 16GB RAM and 8 CPU:
    { enabled = false, name = "mod_dedupe", type = "data_processor", priority = 2, spacymodel = "en_core_web_lg" },

    # For the news event tone classification model - finBERT:
    # Download the models from:
    # https://gohkust-my.sharepoint.com/:f:/g/personal/imyiyang_ust_hk/EksJcamJpclJlbMweFfB5DQB1XrsxURYN5GSqZw3jmSeSw?e=KAyhsX
    # Obtain the vocabulary file from:
    # https://gohkust-my.sharepoint.com/:t:/g/personal/imyiyang_ust_hk/EX3C-KM9bTxOjdttsPslLZUBw_mh9Jdh8PB0WTv6b2tEIA?e=DYBVJY
    # Save the model file and the config.json file to the folders specified here.
    # NOTE(review): mod_classify is enabled although the note above recommends keeping the
    # heavy-model plugins disabled - confirm this is intended.
    { enabled = true, name = "mod_classify", type = "data_processor", priority = 3, mod_eventclass_modelfile = "bert_models/pretrained_weights/pytorch_model.bin", mod_eventclass_weightspath = "bert_models/pretrained_weights", mod_eventclass_vocab_path = "bert_models/finbert_vocab" },
    { enabled = false, name = "mod_summarize", type = "data_processor", priority = 5, llm_service = "ollama", overwrite = false, max_word_count = 850 },
    { enabled = true, name = "mod_solrsubmit", type = "data_processor", priority = 7, host_port = "", username = "solr" },
    { enabled = true, name = "mod_vectorstore", type = "data_processor", priority = 8, destination = "vectorindex", file_format = "json" },
    { enabled = true, name = "mod_persist_data", type = "data_processor", priority = 10, destination = "file", file_format = "json" },

    # Here, the custom command line plugin needs to run last in the data processing pipeline
    # as it expects to retrieve the document from a file.
    { enabled = true, name = "mod_cmdline", type = "data_processor", priority = 99, command_name = "C:\\temp\\testecho.cmd" },
]

model_api_timeout = 200
save_intermediate = true
model_temperature = 0

# For ollama based models, confirm these as each model will have different limits:
max_llm_context_tokens = 16384
max_gen_tokens = 16384

ollama_svc_url = ""
ollama_model_name = "gemma2"

chatgpt_svc_url = "https://api.openai.com/v1/chat/completions"
chatgpt_model_name = "gpt-4o-mini"

gemini_svc_url = "https://generativelanguage.googleapis.com/v1beta/models/"
gemini_model_name = "gemini-1.5-flash"

prompt_summary_part = """Summarize the following text in less than 250 words. Walk through the text in manageable parts step by step, analyzing, grouping similar topics and summarizing as you go. TEXT:"""

prompt_summary_exec = """Summarize the following text in less than 500 words. Walk through the text in manageable parts step by step, analyzing and grouping similar topics and summarizing as you go. TEXT:"""

system_context = "Act as an expert who is analysing the news."

## end of file ##