# This is the configuration file for the NewsLookout web scraping application.
#
# It is organised into 4 sections: installation, operation, logging and plugins.
#
##################################################################################################
#                                                                                                #
# Notice:                                                                                        #
# This software is intended for demonstration and educational purposes only. This software is    #
# experimental and a work in progress. Under no circumstances should these files be used in      #
# relation to any critical system(s). Use of these files is at your own risk.                    #
#                                                                                                #
# Before using it for web scraping any website, always consult that website's terms of use.      #
# Do not use this software to fetch any data from any website that has forbidden use of web      #
# scraping or similar mechanisms, or violates its terms of use in any other way. The author is   #
# not liable for such kind of inappropriate use of this software.                                #
#                                                                                                #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,            #
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR       #
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE      #
# FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR           #
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER         #
# DEALINGS IN THE SOFTWARE.                                                                      #
#                                                                                                #
##################################################################################################


# this section lists the file and directory locations

# The configuration directory is specified here:
conf_dir ="conf"

# The data_dir specifies the name of the directory where the data files will be saved:
data_dir =  "data/files"

# the master data files will be stored in this folder (e.g. those datasets that are appended daily)
master_data_dir = "data/master_data"

models_dir =  "models"

# location of the log file, here it is specified relative to the data directory:
log_file = "logs/newslookout.log"

# If a PID (process identifier) file exists, the application will not launch.
# This is to prevent multiple instances being launched on the same machine.
# As part of the shutdown sequence, the PID file will be automatically deleted.
pid_file = "logs/newslookout.pid"

# the sqlite data file that stores the history of previously retrieved URLs
#completed_urls_datafile = "/media/shared/newslookout_urls.db"
completed_urls_datafile = "data/newslookout_urls.db"

cookie_file="data/cookies.txt"

# levels of recursion to follow links for identifying news articles within websites
# min value is 1, max is 4, any other values do not have any effect
recursion_level=1

# proxy server url (protocol:host:port)
# will attempt to use proxy if this is configured
#proxy_server_url = "http://127.0.0.1:3128/"

# time in seconds to wait for when retrieving a page:
fetch_timeout = 60

# time to wait to establish TCP connection:
connect_timeout = 10

# number of times to retry connecting if failed
retry_count = 3

# the fixed number of seconds to wait between web fetches, this
# fixed time is added to the random time to determine the total wait time
# between two web fetches to the same URL 
retry_wait_fixed_sec = 3

# should raw html be saved as compressed bzipped files?
save_html=true
#save_html=false

# the user agents to use for the web scraper's HTTP(S) requests:
user_agent="Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"

# log levels can be one of the following
# (starting from less verbosity to full verbosity): ERROR, WARN, INFO, or DEBUG
# log_level="ERROR"
# log_level="WARN"
log_level="INFO"
#log_level="DEBUG"

# max file size of the log file, when the log file grows
# beyond this size limit, it will be rotated:
max_logfile_size=2048000

# this many backups of the log file will be retained
logfile_backup_count=30

# In this section, list the names of all modules to be enabled:
# the number assigned to the priority attribute indicates execution priority,
# A lower number indicates higher priority, so it will be run before others are run
plugins = [
   {"enabled"=true, "name"="mod_in_gdelt", "type"="retriever", "priority"=1}
 , {"enabled"=true, "name"="mod_in_nse", "type"="retriever", "priority"=2}
 , {"enabled"=true, "name"="mod_in_bse", "type"="retriever", "priority"=2}
 , {"enabled"=true, "name"="mod_en_in_inexp_business", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_ndtv", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_business_std", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_livemint", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_timesofindia", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_moneycontrol", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_hindu", "type"="retriever", "priority"=3}
 , {"enabled"=true, "name"="mod_en_in_indiankanoon", "type"="retriever", "priority"=5}
 , {"enabled"=true, "name"="mod_en_in_trak", "type"="retriever", "priority"=4}
 , {"enabled"=true, "name"="mod_en_in_forbes", "type"="retriever", "priority"=4}
 , {"enabled"=true, "name"="mod_en_in_ecotimes", "type"="retriever", "priority"=3}
 , {"enabled"=false, "name"="mod_en_in_rbi", "type"="retriever", "priority"=1, "max_pages"=10, "items_per_page"=10}
 , {"enabled"=true, "name"="mod_offline_docs", "type"="retriever", "priority"=3, "file_extension"="json", "folder_name"="data/files", "published_in_past_days"=999999}
# , {"enabled"=true, "name"="mod_offline_docs", "type"="retriever", "priority"=3, "file_extension"="pdf", "folder_name"="data/pdf_files", "published_in_past_days"=999999}
 # the following are data processing plugins that will be run serially, in order of priority:
 # A lower number indicates higher priority, so it will be run before others are run
 , {"enabled"=true, "name"="split_text", "type"="data_processor", "priority"=1, "overwrite"=false, "min_word_limit_to_split"=700, "previous_part_overlap"=70}
 # it is recommended to keep these plugins disabled, they will slow down the entire application
 # since the models are very large and computationally intensive
 # enable them if you are running the application on very capable hardware > 16GB RAM and 8 CPU:
 , {"enabled"=false, "name"="mod_dedupe", "type"="data_processor", "priority"=2, "spacymodel"="en_core_web_lg"}
 # For the news event tone classification model - finBERT:
 # Download the models from:
 # https://gohkust-my.sharepoint.com/:f:/g/personal/imyiyang_ust_hk/EksJcamJpclJlbMweFfB5DQB1XrsxURYN5GSqZw3jmSeSw?e=KAyhsX
 # obtain the vocabulary file from:
 # https://gohkust-my.sharepoint.com/:t:/g/personal/imyiyang_ust_hk/EX3C-KM9bTxOjdttsPslLZUBw_mh9Jdh8PB0WTv6b2tEIA?e=DYBVJY
 # save the model file and the config.json file to the folders specified here:
 , {"enabled"=true, "name"="mod_classify", "type"="data_processor", "priority"=3, "mod_eventclass_modelfile"="bert_models/pretrained_weights/pytorch_model.bin", "mod_eventclass_weightspath"="bert_models/pretrained_weights", "mod_eventclass_vocab_path"="bert_models/finbert_vocab" }
 , {"enabled"=false, "name"="mod_summarize", "type"="data_processor", "priority"=5, "llm_service"="ollama", "overwrite"=false, "max_word_count"=850}
 , {"enabled"=true, "name"="mod_solrsubmit", "type"="data_processor", "priority"=7, "host_port"="https://127.0.0.1:3839", "username"="solr"}
 , {"enabled"=true, "name"="mod_vectorstore", "type"="data_processor", "priority"=8, "destination"="vectorindex", "file_format"="json"}
 , {"enabled"=true, "name"="mod_persist_data", "type"="data_processor", "priority"=10, "destination"="file", "file_format"="json"}
 # here, the custom command line plugin needs to run last in the data processing pipeline as it expects to retrieve the document from a file.
 , {"enabled"=true, "name"="mod_cmdline", "type"="data_processor", "priority"=99, "command_name"="C:\\temp\\testecho.cmd"}
]

model_api_timeout=200
save_intermediate=true
model_temperature=0

# for ollama based models, confirm these as each model will have different limits:
max_llm_context_tokens=16384
max_gen_tokens=16384

ollama_svc_url="http://127.0.0.1:11434/api/generate"
ollama_model_name="gemma2"

chatgpt_svc_url="https://api.openai.com/v1/chat/completions"
chatgpt_model_name="gpt-4o-mini"

gemini_svc_url="https://generativelanguage.googleapis.com/v1beta/models/"
gemini_model_name="gemini-1.5-flash"

prompt_summary_part="""Summarize the following text in less than 250 words.
Walk through the text in manageable parts step by step, analyzing, grouping similar topics and summarizing as you go.
TEXT:"""

prompt_summary_exec="""Summarize the following text in less than 500 words.
Walk through the text in manageable parts step by step, analyzing and grouping similar topics and summarizing as you go.
TEXT:"""

system_context = "Act as an expert who is analysing the news."

## end of file ##